import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings
import plotly.figure_factory as ffdemo
from sklearn.impute import KNNImputer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import chi2_contingency
from plotnine import ggplot, aes, geom_histogram, theme_bw, facet_wrap, labs
!pip install plotnine
!pip install plotly
from plotnine import ggplot, geom_point, aes, theme_bw
warnings.filterwarnings('ignore')
# Source: Geeks for Geeks, 2025
# Source: Dave on Data, K-Means clustering, 2024
# source: Dave on Data, Random Forest, 2024
Requirement already satisfied: plotnine in /Users/jenniferkrohn/anaconda3/lib/python3.6/site-packages (0.8.0) Requirement already satisfied: numpy>=1.19.0 in /Users/jenniferkrohn/anaconda3/lib/python3.6/site-packages (from plotnine) (1.19.2) Requirement already satisfied: pandas>=1.1.0 in /Users/jenniferkrohn/anaconda3/lib/python3.6/site-packages (from plotnine) (1.1.5) Requirement already satisfied: statsmodels>=0.12.1 in /Users/jenniferkrohn/anaconda3/lib/python3.6/site-packages (from plotnine) (0.12.2) Requirement already satisfied: patsy>=0.5.1 in /Users/jenniferkrohn/anaconda3/lib/python3.6/site-packages (from plotnine) (0.5.3) Requirement already satisfied: scipy>=1.5.0 in /Users/jenniferkrohn/anaconda3/lib/python3.6/site-packages (from plotnine) (1.5.2) Requirement already satisfied: matplotlib>=3.1.1 in /Users/jenniferkrohn/anaconda3/lib/python3.6/site-packages (from plotnine) (3.1.2) Requirement already satisfied: mizani>=0.7.3 in /Users/jenniferkrohn/anaconda3/lib/python3.6/site-packages (from plotnine) (0.7.3) Requirement already satisfied: descartes>=1.1.0 in /Users/jenniferkrohn/anaconda3/lib/python3.6/site-packages (from plotnine) (1.1.0) Requirement already satisfied: cycler>=0.10 in /Users/jenniferkrohn/anaconda3/lib/python3.6/site-packages (from matplotlib>=3.1.1->plotnine) (0.11.0) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /Users/jenniferkrohn/anaconda3/lib/python3.6/site-packages (from matplotlib>=3.1.1->plotnine) (3.0.4) Requirement already satisfied: python-dateutil>=2.1 in /Users/jenniferkrohn/anaconda3/lib/python3.6/site-packages (from matplotlib>=3.1.1->plotnine) (2.8.2) Requirement already satisfied: kiwisolver>=1.0.1 in /Users/jenniferkrohn/anaconda3/lib/python3.6/site-packages (from matplotlib>=3.1.1->plotnine) (1.2.0) Requirement already satisfied: palettable in /Users/jenniferkrohn/anaconda3/lib/python3.6/site-packages (from mizani>=0.7.3->plotnine) (3.3.2) Requirement already satisfied: pytz>=2017.2 in /Users/jenniferkrohn/anaconda3/lib/python3.6/site-packages (from pandas>=1.1.0->plotnine) (2021.3) Requirement already satisfied: six in /Users/jenniferkrohn/anaconda3/lib/python3.6/site-packages (from patsy>=0.5.1->plotnine) (1.16.0) Requirement already satisfied: plotly in /Users/jenniferkrohn/anaconda3/lib/python3.6/site-packages (5.18.0) Requirement already satisfied: tenacity>=6.2.0 in /Users/jenniferkrohn/anaconda3/lib/python3.6/site-packages (from plotly) (8.2.2) Requirement already satisfied: packaging in /Users/jenniferkrohn/anaconda3/lib/python3.6/site-packages (from plotly) (21.3) Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /Users/jenniferkrohn/anaconda3/lib/python3.6/site-packages (from packaging->plotly) (3.0.4)
parole_revoc = pd.read_csv('Iowa_Parole_Revocations.csv', index_col='Record ID')
parole_revoc.head()
| Offender Number | Record Date | Fiscal Year Revoked | Revocation Type | Status | Sex | Race & Ethnicity | Age | Offense Code | Offense Class | Offense Description | Offense Type | Offense Subtype | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Record ID | |||||||||||||
| 11029 | 20129393 | 05/01/2017 | 2017.0 | 1. Revoked - New Felony/Aggravated Misdemeanor... | Special Sentence | Male | Black | 27 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | Violent | Sex |
| 11028 | 20828060 | 08/04/2021 | 2022.0 | 5. Revoked - Other Violation (No New Arrest) | Parole | Male | White | 30 | 708.8 | D Felony | GOING ARMED WITH INTENT | Violent | Assault |
| 11027 | 232337 | 05/17/2017 | 2017.0 | 1. Revoked - New Felony/Aggravated Misdemeanor... | Parole | Male | Black | 54 | 715A.6(2)(B) | D Felony | UNAUTH. USE OF CREDIT CARD, GREATER THAN $1K, ... | Property | Forgery/Fraud |
| 11026 | 18312314 | 05/15/2018 | 2018.0 | 3. Revoked - New Arrest Pending or Found Not G... | Parole | Male | Hispanic | 35 | 124.411 | Felony - Enhancement to Original Penalty | Cont Subst-2nd or Subseq-not to exceed 3X Pena... | Drug | Trafficking |
| 11025 | 18011120 | 12/15/2016 | 2017.0 | 2. Revoked - Serious Misdemeanor New Conviction | Special Sentence | Male | White | 31 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | Violent | Sex |
probation_revoc = pd.read_csv('Iowa_Probation_Revocations.csv')
probation_revoc.head(1)
| Record ID | Offender Number | Revocation Date | Fiscal Year Revoked | Revocation Type | Status Revoked | Sex | Race & Ethnicity | Age | Offense Classification | Offense Description | Offense Type | Offense Subtype | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 20229803 | 06/15/2022 | 2022.0 | 3. New Arrest | Probation | Male | White | 33 | D Felony | Possession of a Controlled Substance - 3rd and... | Drug | Drug Possession |
released = pd.read_csv('Offenders_Released_from_Iowa_Prisons.csv')
released.head(1)
| Record ID | Offender Number | Release Date | Fiscal Year Released | Closure Type | Supervision Status | Institution Name | Sex | Race & Ethnicity | Age at Release | Offense Code | Offense Classification | Offense Description | Offense Type | Offense Subtype | Jurisdiction | Admission Date | Months Served | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 21964 | 19633526 | 07/20/2018 | 2019.0 | Acquitted/Dismissed | Prison | Fort Dodge Correctional Facility | Male | Hispanic | 27.0 | 811.2(8),A | D Felony | FAILURE TO APPEAR FELONY CHG. | Public Order | Flight/Escape | Muscatine | 09/06/2016 | 22.434211 |
parole_revoc.describe()
| Offender Number | Fiscal Year Revoked | Age | |
|---|---|---|---|
| count | 1.103000e+04 | 10335.000000 | 11030.000000 |
| mean | 1.443323e+07 | 2019.406483 | 35.868903 |
| std | 7.794457e+06 | 2.753595 | 9.965033 |
| min | 4.475000e+03 | 2015.000000 | 17.000000 |
| 25% | 3.404216e+06 | 2017.000000 | 28.000000 |
| 50% | 1.883369e+07 | 2019.000000 | 34.000000 |
| 75% | 1.989378e+07 | 2022.000000 | 42.000000 |
| max | 2.099981e+07 | 2024.000000 | 78.000000 |
parole_revoc.shape
(11030, 13)
parole_revoc.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 11030 entries, 11029 to 0 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Offender Number 11030 non-null int64 1 Record Date 11030 non-null object 2 Fiscal Year Revoked 10335 non-null float64 3 Revocation Type 11030 non-null object 4 Status 11030 non-null object 5 Sex 11030 non-null object 6 Race & Ethnicity 11030 non-null object 7 Age 11030 non-null int64 8 Offense Code 11030 non-null object 9 Offense Class 11030 non-null object 10 Offense Description 11030 non-null object 11 Offense Type 11030 non-null object 12 Offense Subtype 11030 non-null object dtypes: float64(1), int64(2), object(10) memory usage: 1.2+ MB
probation_revoc.describe()
| Record ID | Offender Number | Fiscal Year Revoked | Age | |
|---|---|---|---|---|
| count | 17318.000000 | 1.731800e+04 | 16356.000000 | 17318.000000 |
| mean | 8658.500000 | 1.674768e+07 | 2019.241012 | 31.340744 |
| std | 4999.420316 | 6.296836e+06 | 2.857119 | 10.145441 |
| min | 0.000000 | 1.355000e+03 | 2015.000000 | 16.000000 |
| 25% | 4329.250000 | 1.830045e+07 | 2017.000000 | 23.000000 |
| 50% | 8658.500000 | 1.922012e+07 | 2019.000000 | 30.000000 |
| 75% | 12987.750000 | 2.013370e+07 | 2022.000000 | 37.000000 |
| max | 17317.000000 | 2.099946e+07 | 2024.000000 | 77.000000 |
probation_revoc.shape
(17318, 13)
probation_revoc.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 17318 entries, 0 to 17317 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Record ID 17318 non-null int64 1 Offender Number 17318 non-null int64 2 Revocation Date 17316 non-null object 3 Fiscal Year Revoked 16356 non-null float64 4 Revocation Type 17318 non-null object 5 Status Revoked 17318 non-null object 6 Sex 17318 non-null object 7 Race & Ethnicity 17317 non-null object 8 Age 17318 non-null int64 9 Offense Classification 17318 non-null object 10 Offense Description 17318 non-null object 11 Offense Type 17318 non-null object 12 Offense Subtype 17318 non-null object dtypes: float64(1), int64(3), object(9) memory usage: 1.7+ MB
released.describe()
| Record ID | Offender Number | Fiscal Year Released | Age at Release | Months Served | |
|---|---|---|---|---|---|
| count | 53412.000000 | 5.341200e+04 | 50662.000000 | 53411.000000 | 53412.000000 |
| mean | 26705.500000 | 1.525203e+07 | 2019.135565 | 36.316339 | 18.180240 |
| std | 15418.860626 | 7.399300e+06 | 2.767619 | 10.837824 | 31.366340 |
| min | 0.000000 | 7.190000e+02 | 2015.000000 | 16.000000 | 0.000000 |
| 25% | 13352.750000 | 3.487614e+06 | 2017.000000 | 28.000000 | 6.250000 |
| 50% | 26705.500000 | 1.898925e+07 | 2019.000000 | 34.000000 | 10.921053 |
| 75% | 40058.250000 | 2.000780e+07 | 2021.000000 | 43.000000 | 18.190789 |
| max | 53411.000000 | 2.988174e+07 | 2024.000000 | 96.000000 | 790.888158 |
released.shape
(53412, 18)
released.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 53412 entries, 0 to 53411 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Record ID 53412 non-null int64 1 Offender Number 53412 non-null int64 2 Release Date 53412 non-null object 3 Fiscal Year Released 50662 non-null float64 4 Closure Type 53412 non-null object 5 Supervision Status 53412 non-null object 6 Institution Name 53389 non-null object 7 Sex 53412 non-null object 8 Race & Ethnicity 53412 non-null object 9 Age at Release 53411 non-null float64 10 Offense Code 52013 non-null object 11 Offense Classification 52013 non-null object 12 Offense Description 52013 non-null object 13 Offense Type 52240 non-null object 14 Offense Subtype 52013 non-null object 15 Jurisdiction 52013 non-null object 16 Admission Date 53412 non-null object 17 Months Served 53412 non-null float64 dtypes: float64(3), int64(2), object(13) memory usage: 7.3+ MB
# Count of nulls in released data
released.isna().sum()
Record ID 0 Offender Number 0 Release Date 0 Fiscal Year Released 2750 Closure Type 0 Supervision Status 0 Institution Name 23 Sex 0 Race & Ethnicity 0 Age at Release 1 Offense Code 1399 Offense Classification 1399 Offense Description 1399 Offense Type 1172 Offense Subtype 1399 Jurisdiction 1399 Admission Date 0 Months Served 0 dtype: int64
parole_revoc['Offender Number'].duplicated().any()
True
probation_revoc['Offender Number'].duplicated().any()
True
released['Offender Number'].duplicated().any()
True
# Are offenders in the different dataframes?
parole_revoc.index.isin(probation_revoc['Offender Number']).any() or parole_revoc.index.isin(released['Offender Number']).any()
True
# Why were offenders released from prison?
released['Closure Type'].unique()
array(['Acquitted/Dismissed', 'Death', 'Discharged - Deported',
'Discharged - Expiration of Sentence',
'Discharge - Early Discharge', 'none from supervision',
'Paroled to Detainer - INS', 'Paroled to Detainer - Iowa',
'Paroled to Detainer - Out of State',
'Paroled to Detainer - U.S. Marshall',
'Paroled w/Immediate Discharge', 'Parole Granted',
'Probation Granted - Non Shock',
'Probation Granted - Reconsideration of Sentence',
'Released on Bond', 'Released to Custody of US Bureau of Prisons',
'Released to Custody of US Probation Office',
'Released to OWI Continuum', 'Released to Special Sentence',
'Returned to Sending Jurisdiction', 'Revoked',
'Revoked From Parole - Discharged',
'Revoked From Parole - Work Release Granted',
'Sent Back to Jail (Designated Jail Sites Only)',
'Terminated by Appeal - Conviction Overturned',
'Terminated by Court', 'Work Release Granted',
'Work Unit - Case Manager to Case Manager',
'Work Unit - Field to Field', 'Work Unit - Field to Prison',
'Work Unit - Hospital Appt. Holdover to Prison',
'Work Unit - Interstate Compact to Prison',
'Work Unit - Prison to Corrections Compact',
'Work Unit - Prison to Escape', 'Work Unit - Prison to Field',
'Work Unit - Prison to Hospital Appt. Holdover',
'Work Unit - Prison to Interstate Compact',
'Work Unit - Prison to Prison',
'Work Unit - Prison to Residential',
'Work Unit - Prison to Stayed Under Appeal',
'Work Unit - Residential to Residential',
'Work Unit - Transfer from County Jail Contract'], dtype=object)
# Delete those that have been acquitted:
closure = released['Closure Type'] != 'Acquitted/Dismissed'
released1 = released[closure].copy()
# Round months served
released1['Served Mos Rounded'] = released1['Months Served'].round().astype(int)
released1['Age in Mos'] = released1['Age at Release'] * 12
released1['Convicted Age'] = (released1['Age in Mos'] - released1['Served Mos Rounded'])/12
released1.head()
| Record ID | Offender Number | Release Date | Fiscal Year Released | Closure Type | Supervision Status | Institution Name | Sex | Race & Ethnicity | Age at Release | ... | Offense Classification | Offense Description | Offense Type | Offense Subtype | Jurisdiction | Admission Date | Months Served | Served Mos Rounded | Age in Mos | Convicted Age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 1 | 1201637 | 03/04/2020 | 2020.0 | Death | Prison | Iowa State Penitentiary | Male | White | 71.0 | ... | A Felony | MURDER 1ST DEGREE | Violent | Murder/Manslaughter | Marshall | 04/10/1979 | 491.414474 | 491 | 852.0 | 30.083333 |
| 4 | 241 | 2412596 | 01/03/2015 | 2015.0 | Death | Prison | Iowa Medical & Classification Center | Male | White | 65.0 | ... | A Felony | MURDER 1ST DEGREE | Violent | Murder/Manslaughter | Boone | 12/18/1987 | 324.934211 | 325 | 780.0 | 37.916667 |
| 5 | 387 | 18048653 | 06/19/2016 | 2016.0 | Death | Prison | Iowa Medical & Classification Center | Male | White | 51.0 | ... | B Felony | SEXUAL ABUSE, 2ND DEGREE, 85% | Violent | Sex | Polk | 07/31/2009 | 82.730263 | 83 | 612.0 | 44.083333 |
| 6 | 484 | 161060 | 07/19/2019 | 2020.0 | Death | Prison | Anamosa State Penitentiary | Male | White | 70.0 | ... | A Felony | MURDER 1ST DEGREE | Violent | Murder/Manslaughter | Linn | 11/18/2013 | 68.059211 | 68 | 840.0 | 64.333333 |
| 7 | 531 | 20094065 | 01/22/2025 | NaN | Death | Prison | Iowa Medical & Classification Center | Male | White | 84.0 | ... | B Felony | MURDER - 2ND DEGREE, 85% | Violent | Murder/Manslaughter | Washington | 02/04/2013 | 143.750000 | 144 | 1008.0 | 72.000000 |
5 rows × 21 columns
null_convict_age = released1[released1[['Convicted Age']].isnull().any(axis=1)]
null_convict_age
| Record ID | Offender Number | Release Date | Fiscal Year Released | Closure Type | Supervision Status | Institution Name | Sex | Race & Ethnicity | Age at Release | ... | Offense Classification | Offense Description | Offense Type | Offense Subtype | Jurisdiction | Admission Date | Months Served | Served Mos Rounded | Age in Mos | Convicted Age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 32716 | 8459 | 29702621 | 05/11/2016 | 2016.0 | Released to Custody of US Bureau of Prisons | Prison Safekeeper | Iowa Correctional Institution for Women | Unknown | Unknown | NaN | ... | Other Felony | COMPACT PRISONER | Other | Other Criminal | Federal | 01/11/2002 | 172.171053 | 172 | NaN | NaN |
1 rows × 21 columns
# Drop all of those that are in prison for safety of themselves or others
released1 = released1.drop(released1[released1['Supervision Status'] == 'Prison Safekeeper'].index)
released1.shape
(52002, 21)
# Round the convicted age to nearest integer
released1['Convicted Age'] = released1['Convicted Age'].round().astype(int)
released1.head()
| Record ID | Offender Number | Release Date | Fiscal Year Released | Closure Type | Supervision Status | Institution Name | Sex | Race & Ethnicity | Age at Release | ... | Offense Classification | Offense Description | Offense Type | Offense Subtype | Jurisdiction | Admission Date | Months Served | Served Mos Rounded | Age in Mos | Convicted Age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 1 | 1201637 | 03/04/2020 | 2020.0 | Death | Prison | Iowa State Penitentiary | Male | White | 71.0 | ... | A Felony | MURDER 1ST DEGREE | Violent | Murder/Manslaughter | Marshall | 04/10/1979 | 491.414474 | 491 | 852.0 | 30 |
| 4 | 241 | 2412596 | 01/03/2015 | 2015.0 | Death | Prison | Iowa Medical & Classification Center | Male | White | 65.0 | ... | A Felony | MURDER 1ST DEGREE | Violent | Murder/Manslaughter | Boone | 12/18/1987 | 324.934211 | 325 | 780.0 | 38 |
| 5 | 387 | 18048653 | 06/19/2016 | 2016.0 | Death | Prison | Iowa Medical & Classification Center | Male | White | 51.0 | ... | B Felony | SEXUAL ABUSE, 2ND DEGREE, 85% | Violent | Sex | Polk | 07/31/2009 | 82.730263 | 83 | 612.0 | 44 |
| 6 | 484 | 161060 | 07/19/2019 | 2020.0 | Death | Prison | Anamosa State Penitentiary | Male | White | 70.0 | ... | A Felony | MURDER 1ST DEGREE | Violent | Murder/Manslaughter | Linn | 11/18/2013 | 68.059211 | 68 | 840.0 | 64 |
| 7 | 531 | 20094065 | 01/22/2025 | NaN | Death | Prison | Iowa Medical & Classification Center | Male | White | 84.0 | ... | B Felony | MURDER - 2ND DEGREE, 85% | Violent | Murder/Manslaughter | Washington | 02/04/2013 | 143.750000 | 144 | 1008.0 | 72 |
5 rows × 21 columns
# Make a copy in case I make an error
released1_org = released1.copy()
parole_revoc_org = parole_revoc.copy()
probation_revoc_org = probation_revoc.copy()
# Drop institution and jurisdiction columns
released1 = released1.drop(columns=['Institution Name ', 'Jurisdiction'])
# I want to move the offense code to the end
parole_revoc = parole_revoc[['Offender Number', 'Record Date', 'Fiscal Year Revoked',
'Revocation Type', 'Status', 'Sex', 'Race & Ethnicity', 'Age',
'Offense Class', 'Offense Description', 'Offense Type',
'Offense Subtype', 'Offense Code']]
parole_revoc.head(1)
| Offender Number | Record Date | Fiscal Year Revoked | Revocation Type | Status | Sex | Race & Ethnicity | Age | Offense Class | Offense Description | Offense Type | Offense Subtype | Offense Code | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Record ID | |||||||||||||
| 11029 | 20129393 | 05/01/2017 | 2017.0 | 1. Revoked - New Felony/Aggravated Misdemeanor... | Special Sentence | Male | Black | 27 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | Violent | Sex | 903B.1 |
# 1st combining Parole and Probation dataframes
concat1 = pd.concat([parole_revoc, probation_revoc], ignore_index=True)
# In preparation to merge, adding r to every column title and then drop it from offender number
released1.columns = [col + '_r' for col in released1.columns]
released1.head(1)
| Record ID_r | Offender Number_r | Release Date_r | Fiscal Year Released_r | Closure Type_r | Supervision Status_r | Sex_r | Race & Ethnicity_r | Age at Release_r | Offense Code_r | Offense Classification_r | Offense Description_r | Offense Type_r | Offense Subtype_r | Admission Date_r | Months Served_r | Served Mos Rounded_r | Age in Mos_r | Convicted Age_r | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 1 | 1201637 | 03/04/2020 | 2020.0 | Death | Prison | Male | White | 71.0 | 707.2 | A Felony | MURDER 1ST DEGREE | Violent | Murder/Manslaughter | 04/10/1979 | 491.414474 | 491 | 852.0 | 30 |
released1.rename(columns={'Offender Number_r': 'Offender Number'}, inplace=True)
released1.head(1)
| Record ID_r | Offender Number | Release Date_r | Fiscal Year Released_r | Closure Type_r | Supervision Status_r | Sex_r | Race & Ethnicity_r | Age at Release_r | Offense Code_r | Offense Classification_r | Offense Description_r | Offense Type_r | Offense Subtype_r | Admission Date_r | Months Served_r | Served Mos Rounded_r | Age in Mos_r | Convicted Age_r | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 1 | 1201637 | 03/04/2020 | 2020.0 | Death | Prison | Male | White | 71.0 | 707.2 | A Felony | MURDER 1ST DEGREE | Violent | Murder/Manslaughter | 04/10/1979 | 491.414474 | 491 | 852.0 | 30 |
merged1 = pd.merge(released1, concat1, on='Offender Number', how='outer')
merged1.head(1)
| Record ID_r | Offender Number | Release Date_r | Fiscal Year Released_r | Closure Type_r | Supervision Status_r | Sex_r | Race & Ethnicity_r | Age at Release_r | Offense Code_r | ... | Age | Offense Class | Offense Description | Offense Type | Offense Subtype | Offense Code | Record ID | Revocation Date | Status Revoked | Offense Classification | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 1201637 | 03/04/2020 | 2020.0 | Death | Prison | Male | White | 71.0 | 707.2 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 rows × 35 columns
# Make a copy of the merged dataframe
merged1_org = merged1.copy()
merged1['Sex'] = merged1['Sex'].fillna(merged1['Sex_r'])
merged1['Race & Ethnicity'] = merged1['Race & Ethnicity'].fillna(merged1['Race & Ethnicity_r'])
merged1['FY'] = merged1['Fiscal Year Released_r']
merged1['FY'] = merged1['Fiscal Year Revoked'].fillna(merged1['FY'])
find_nulls = merged1.loc[merged1['FY'].isnull()]
find_nulls
| Record ID_r | Offender Number | Release Date_r | Fiscal Year Released_r | Closure Type_r | Supervision Status_r | Sex_r | Race & Ethnicity_r | Age at Release_r | Offense Code_r | ... | Offense Class | Offense Description | Offense Type | Offense Subtype | Offense Code | Record ID | Revocation Date | Status Revoked | Offense Classification | FY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 531.0 | 20094065 | 01/22/2025 | NaN | Death | Prison | Male | White | 84.0 | 707.3 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 12 | 2923.0 | 166379 | 10/18/2024 | NaN | Death | Prison | Male | White | 60.0 | 707.2 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 22 | 5482.0 | 18143516 | 01/29/2025 | NaN | Death | Prison Compact | Male | Hispanic | 62.0 | 9999 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 39 | 8686.0 | 313136 | 02/12/2025 | NaN | Death | Prison | Male | Black | 67.0 | 707.2 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 53 | 10603.0 | 20987489 | 02/04/2025 | NaN | Death | Prison | Male | White | 53.0 | 709.3 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 77454 | NaN | 18101756 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | ELUDING-Speed in Excess of 25 mph Over Limit, ... | Public Order | Traffic | NaN | 17139.0 | 03/04/2025 | Probation | D Felony | NaN |
| 77462 | NaN | 19525397 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | Stalking - Violation of Protective Order | Violent | Assault | NaN | 17221.0 | 01/23/2025 | Probation | C Felony | NaN |
| 77464 | NaN | 20738345 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | Possession of a Controlled Substance - 3rd and... | Drug | Drug Possession | NaN | 17226.0 | 07/11/2024 | Probation | D Felony | NaN |
| 77466 | NaN | 18088055 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | Possession of a Controlled Substance - 3rd and... | Drug | Drug Possession | NaN | 17247.0 | 10/03/2024 | Probation | D Felony | NaN |
| 77467 | NaN | 3366848 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | ELUDING-Speed in Excess of 25 mph Over Limit, ... | Public Order | Traffic | NaN | 17253.0 | 01/28/2025 | Probation | D Felony | NaN |
1554 rows × 36 columns
# Need to convert to date obj for next step
merged1['Release Date_r'] = pd.to_datetime(merged1['Release Date_r'])
# Convert to FY, Iowa's Fiscal Year is July 1 - June 30
merged1['FY_r'] = merged1['Release Date_r'].dt.to_period('Q-JUN').dt.qyear
# Source: Stack Overflow, 2019
# Carry over to the nulls
merged1['FY'] = merged1['FY_r'].fillna(merged1['FY'])
merged1['Revocation Date'] = pd.to_datetime(merged1['Revocation Date'])
# Can overwrite the column again
merged1['FY_r'] = merged1['Release Date_r'].dt.to_period('Q-JUN').dt.qyear
merged1['FY'] = merged1['FY_r'].fillna(merged1['FY'])
merged1['off_combo_r'] = merged1['Offense Classification_r'] + ' - ' + merged1['Offense Subtype_r']
merged1['off_combo'] = merged1['Offense Classification'] + ' - ' + merged1['Offense Subtype']
merged1['offense_classification'] = np.nan
merged1['offense_classification'] = merged1['offense_classification'].fillna(merged1['off_combo_r'])
col_val_list = merged1['offense_classification'].unique()
col_val_list
array(['A Felony - Murder/Manslaughter', 'B Felony - Sex',
'B Felony - Murder/Manslaughter', 'A Felony - Kidnap',
'C Felony - Assault', 'Other Felony - Other Criminal',
'Aggravated Misdemeanor - Theft', 'C Felony - Sex',
'NCIC Virtual Code - Other Violent',
'C Felony - Murder/Manslaughter', 'C Felony - Theft',
'Felony - Enhancement to Original Penalty - Sex',
'Other Felony - Murder/Manslaughter', 'D Felony - Other Criminal',
'B Felony - Robbery', 'C Felony - Trafficking',
'D Felony - Drug Possession', 'D Felony - Burglary',
'Felony - Enhancement to Original Penalty - Trafficking',
'Special Sentence 2005 - Sex', 'C Felony - Burglary',
'Other Felony - Sex', 'A Felony - Sex',
'Aggravated Misdemeanor - OWI', 'C Felony - Robbery',
'D Felony - OWI', 'Aggravated Misdemeanor - Assault',
'C Felony - Vandalism', 'D Felony - Assault',
'D Felony - Forgery/Fraud', 'D Felony - Weapons',
'B Felony - Other Violent', 'B Felony - Trafficking',
'B Felony - Kidnap', 'D Felony - Other Public Order',
'D Felony - Theft', 'Aggravated Misdemeanor - Other Criminal',
'Aggravated Misdemeanor - Drug Possession', 'B Felony - Arson',
'Aggravated Misdemeanor - Traffic', 'B Felony - Other Criminal',
'C Felony - Arson', 'C Felony - Other Violent',
'NCIC Virtual Code - Murder/Manslaughter',
'D Felony - Trafficking', 'Aggravated Misdemeanor - Animals',
'Serious Misdemeanor - Assault',
'Aggravated Misdemeanor - Burglary',
'Aggravated Misdemeanor - Other Violent',
'Aggravated Misdemeanor - Alcohol', 'D Felony - Other Violent',
'Aggravated Misdemeanor - Weapons', 'D Felony - Traffic',
'D Felony - Other Drug', 'Aggravated Misdemeanor - Sex',
'D Felony - Kidnap', 'D Felony - Flight/Escape',
'Aggravated Misdemeanor - Other Public Order',
'Aggravated Misdemeanor - Vandalism',
'Aggravated Misdemeanor - Forgery/Fraud',
'Aggravated Misdemeanor - Prostitution/Pimping',
'D Felony - Vandalism', 'Serious Misdemeanor - Theft',
'D Felony - Murder/Manslaughter', 'C Felony - Other Criminal',
'D Felony - Arson', 'Aggravated Misdemeanor - Arson',
'C Felony - Other Public Order',
'Serious Misdemeanor - Drug Possession', 'C Felony - Traffic',
'Aggravated Misdemeanor - Kidnap', 'C Felony - Kidnap',
'D Felony - Sex', 'Serious Misdemeanor - Other Property',
'Aggravated Misdemeanor - Trafficking',
'Serious Misdemeanor - Other Public Order',
'Aggravated Misdemeanor - Robbery',
'Serious Misdemeanor - Flight/Escape',
'Serious Misdemeanor - Traffic',
'Aggravated Misdemeanor - Murder/Manslaughter',
'Serious Misdemeanor - Vandalism', 'Serious Misdemeanor - OWI',
'Simple Misdemeanor - Other Public Order',
'C Felony - Forgery/Fraud', 'C Felony - Other Drug',
'Felony - Mandatory Minimum - Sex', 'Serious Misdemeanor - Arson',
'Serious Misdemeanor - Sex', 'Serious Misdemeanor - Alcohol',
'C Felony - Weapons', nan,
'Aggravated Misdemeanor - Flight/Escape',
'Felony - Mandatory Minimum - Other Government',
'D Felony - Prostitution/Pimping',
'Felony - Enhancement to Original Penalty - Other Criminal',
'NCIC Virtual Code - Other Drug', 'C Felony - Drug Possession',
'Aggravated Misdemeanor - Other Drug',
'Simple Misdemeanor - Burglary', 'Serious Misdemeanor - Weapons',
'D Felony - Stolen Property', 'Other Felony - Trafficking',
'B Felony - Assault', 'Serious Misdemeanor - Other Drug',
'C Felony - Stolen Property', 'NCIC Virtual Code - Weapons',
'NCIC Virtual Code - Assault',
'NCIC Virtual Code - Other Criminal', 'NCIC Virtual Code - Sex',
'NCIC Virtual Code - Burglary',
'NCIC Virtual Code - Stolen Property',
'NCIC Virtual Code - Robbery', 'NCIC Virtual Code - Kidnap',
'NCIC Virtual Code - Forgery/Fraud'], dtype=object)
merged1['offense_classification'].value_counts(dropna=False)
C Felony - Trafficking 9897
D Felony - Burglary 5442
D Felony - Drug Possession 4652
D Felony - Assault 4000
D Felony - Theft 3890
...
Serious Misdemeanor - Traffic 1
Serious Misdemeanor - Other Property 1
Simple Misdemeanor - Burglary 1
Serious Misdemeanor - Weapons 1
Felony - Mandatory Minimum - Other Government 1
Name: offense_classification, Length: 114, dtype: int64
merged1['offense_classification'] = merged1['offense_classification'].fillna(merged1['off_combo'])
merged1['off_combo_l'] = merged1['Offense Classification'] + ' - ' + merged1['Offense Type']
merged1['offense_classification'] = merged1['offense_classification'].fillna(merged1['off_combo_l'])
merged1['off_combo_s'] = merged1['Offense Class'] + ' - ' + merged1['Offense Subtype']
merged1['offense_classification'] = merged1['offense_classification'].fillna(merged1['off_combo_s'])
find_nulls = merged1.loc[merged1['offense_classification'].isnull()]
find_nulls # 2 nulls
| Record ID_r | Offender Number | Release Date_r | Fiscal Year Released_r | Closure Type_r | Supervision Status_r | Sex_r | Race & Ethnicity_r | Age at Release_r | Offense Code_r | ... | Revocation Date | Status Revoked | Offense Classification | FY | FY_r | off_combo_r | off_combo | offense_classification | off_combo_l | off_combo_s | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12023 | 24766.0 | 3079070 | 2015-02-10 | 2015.0 | Returned to Sending Jurisdiction | Prison | Male | White | 45.0 | NaN | ... | NaT | NaN | NaN | 2015 | 2015 | NaN | NaN | NaN | NaN | NaN |
| 67707 | 12976.0 | 20423309 | 2018-05-23 | 2018.0 | Returned to Sending Jurisdiction | Prison | Male | White | 19.0 | NaN | ... | NaT | NaN | NaN | 2018 | 2018 | NaN | NaN | NaN | NaN | NaN |
2 rows × 42 columns
col_val_list = find_nulls['Supervision Status_r'].unique()
col_val_list
# What are the 'Prison' population of the nulls?
array(['Prison'], dtype=object)
find_nulls[find_nulls['Supervision Status_r'] == 'Prison']
| Record ID_r | Offender Number | Release Date_r | Fiscal Year Released_r | Closure Type_r | Supervision Status_r | Sex_r | Race & Ethnicity_r | Age at Release_r | Offense Code_r | ... | Revocation Date | Status Revoked | Offense Classification | FY | FY_r | off_combo_r | off_combo | offense_classification | off_combo_l | off_combo_s | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12023 | 24766.0 | 3079070 | 2015-02-10 | 2015.0 | Returned to Sending Jurisdiction | Prison | Male | White | 45.0 | NaN | ... | NaT | NaN | NaN | 2015 | 2015 | NaN | NaN | NaN | NaN | NaN |
| 67707 | 12976.0 | 20423309 | 2018-05-23 | 2018.0 | Returned to Sending Jurisdiction | Prison | Male | White | 19.0 | NaN | ... | NaT | NaN | NaN | 2018 | 2018 | NaN | NaN | NaN | NaN | NaN |
2 rows × 42 columns
# What are the remaining nulls?
col_val_list = find_nulls['Closure Type_r'].unique()
print(col_val_list)
# to finish out the offense_classification column, I'll take these and add unknown to first of category
# 'Returned to Sending Jurisdiction'so details are unknown
['Returned to Sending Jurisdiction']
merged1['off_combo_u'] = 'Unknown' + ' - ' + merged1['Closure Type_r']
# Fill the NAs
merged1['offense_classification'] = merged1['offense_classification'].fillna(merged1['off_combo_u'])
# Reran all find nulls - all are gone
col_val_list = merged1['offense_classification'].unique()
col_val_list
array(['A Felony - Murder/Manslaughter', 'B Felony - Sex',
'B Felony - Murder/Manslaughter', 'A Felony - Kidnap',
'C Felony - Assault', 'Other Felony - Other Criminal',
'Aggravated Misdemeanor - Theft', 'C Felony - Sex',
'NCIC Virtual Code - Other Violent',
'C Felony - Murder/Manslaughter', 'C Felony - Theft',
'Felony - Enhancement to Original Penalty - Sex',
'Other Felony - Murder/Manslaughter', 'D Felony - Other Criminal',
'B Felony - Robbery', 'C Felony - Trafficking',
'D Felony - Drug Possession', 'D Felony - Burglary',
'Felony - Enhancement to Original Penalty - Trafficking',
'Special Sentence 2005 - Sex', 'C Felony - Burglary',
'Other Felony - Sex', 'A Felony - Sex',
'Aggravated Misdemeanor - OWI', 'C Felony - Robbery',
'D Felony - OWI', 'Aggravated Misdemeanor - Assault',
'C Felony - Vandalism', 'D Felony - Assault',
'D Felony - Forgery/Fraud', 'D Felony - Weapons',
'B Felony - Other Violent', 'B Felony - Trafficking',
'B Felony - Kidnap', 'D Felony - Other Public Order',
'D Felony - Theft', 'Aggravated Misdemeanor - Other Criminal',
'Aggravated Misdemeanor - Drug Possession', 'B Felony - Arson',
'Aggravated Misdemeanor - Traffic', 'B Felony - Other Criminal',
'C Felony - Arson', 'C Felony - Other Violent',
'NCIC Virtual Code - Murder/Manslaughter',
'D Felony - Trafficking', 'Aggravated Misdemeanor - Animals',
'Serious Misdemeanor - Assault',
'Aggravated Misdemeanor - Burglary',
'Aggravated Misdemeanor - Other Violent',
'Aggravated Misdemeanor - Alcohol', 'D Felony - Other Violent',
'Aggravated Misdemeanor - Weapons', 'D Felony - Traffic',
'D Felony - Other Drug', 'Aggravated Misdemeanor - Sex',
'D Felony - Kidnap', 'D Felony - Flight/Escape',
'Aggravated Misdemeanor - Other Public Order',
'Aggravated Misdemeanor - Vandalism',
'Aggravated Misdemeanor - Forgery/Fraud',
'Aggravated Misdemeanor - Prostitution/Pimping',
'D Felony - Vandalism', 'Serious Misdemeanor - Theft',
'D Felony - Murder/Manslaughter', 'C Felony - Other Criminal',
'D Felony - Arson', 'Aggravated Misdemeanor - Arson',
'C Felony - Other Public Order',
'Serious Misdemeanor - Drug Possession', 'C Felony - Traffic',
'Aggravated Misdemeanor - Kidnap', 'C Felony - Kidnap',
'D Felony - Sex', 'Serious Misdemeanor - Other Property',
'Aggravated Misdemeanor - Trafficking',
'Serious Misdemeanor - Other Public Order',
'Aggravated Misdemeanor - Robbery',
'Serious Misdemeanor - Flight/Escape',
'Serious Misdemeanor - Traffic',
'Aggravated Misdemeanor - Murder/Manslaughter',
'Serious Misdemeanor - Vandalism', 'Serious Misdemeanor - OWI',
'Simple Misdemeanor - Other Public Order',
'C Felony - Forgery/Fraud', 'C Felony - Other Drug',
'Felony - Mandatory Minimum - Sex', 'Serious Misdemeanor - Arson',
'Serious Misdemeanor - Sex', 'Serious Misdemeanor - Alcohol',
'C Felony - Weapons', 'Unknown - Returned to Sending Jurisdiction',
'Aggravated Misdemeanor - Flight/Escape',
'Felony - Mandatory Minimum - Other Government',
'D Felony - Prostitution/Pimping',
'Felony - Enhancement to Original Penalty - Other Criminal',
'NCIC Virtual Code - Other Drug', 'C Felony - Drug Possession',
'Aggravated Misdemeanor - Other Drug',
'Simple Misdemeanor - Burglary', 'Serious Misdemeanor - Weapons',
'D Felony - Stolen Property', 'Other Felony - Trafficking',
'B Felony - Assault', 'Serious Misdemeanor - Other Drug',
'C Felony - Stolen Property', 'NCIC Virtual Code - Weapons',
'NCIC Virtual Code - Assault',
'NCIC Virtual Code - Other Criminal', 'NCIC Virtual Code - Sex',
'NCIC Virtual Code - Burglary',
'NCIC Virtual Code - Stolen Property',
'NCIC Virtual Code - Robbery', 'NCIC Virtual Code - Kidnap',
'NCIC Virtual Code - Forgery/Fraud',
'Simple Misdemeanor - Assault', 'Simple Misdemeanor - Vandalism',
'Serious Misdemeanor - Trafficking', 'D Felony - Other Property'],
dtype=object)
# Testing for nulls
merged1.info() # still lots of nulls
<class 'pandas.core.frame.DataFrame'> Int64Index: 77472 entries, 0 to 77471 Data columns (total 43 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Record ID_r 75801 non-null float64 1 Offender Number 77472 non-null int64 2 Release Date_r 75801 non-null datetime64[ns] 3 Fiscal Year Released_r 72039 non-null float64 4 Closure Type_r 75801 non-null object 5 Supervision Status_r 75801 non-null object 6 Sex_r 75801 non-null object 7 Race & Ethnicity_r 75801 non-null object 8 Age at Release_r 75801 non-null float64 9 Offense Code_r 75794 non-null object 10 Offense Classification_r 75794 non-null object 11 Offense Description_r 75794 non-null object 12 Offense Type_r 75791 non-null object 13 Offense Subtype_r 75794 non-null object 14 Admission Date_r 75801 non-null object 15 Months Served_r 75801 non-null float64 16 Served Mos Rounded_r 75801 non-null float64 17 Age in Mos_r 75801 non-null float64 18 Convicted Age_r 75801 non-null float64 19 Record Date 31486 non-null object 20 Fiscal Year Revoked 58802 non-null float64 21 Revocation Type 61611 non-null object 22 Status 31486 non-null object 23 Sex 77472 non-null object 24 Race & Ethnicity 77471 non-null object 25 Age 61611 non-null float64 26 Offense Class 31486 non-null object 27 Offense Description 61611 non-null object 28 Offense Type 61611 non-null object 29 Offense Subtype 61611 non-null object 30 Offense Code 31486 non-null object 31 Record ID 30125 non-null float64 32 Revocation Date 30123 non-null datetime64[ns] 33 Status Revoked 30125 non-null object 34 Offense Classification 30125 non-null object 35 FY 77472 non-null int64 36 FY_r 77472 non-null int64 37 off_combo_r 75794 non-null object 38 off_combo 30125 non-null object 39 offense_classification 77472 non-null object 40 off_combo_l 30125 non-null object 41 off_combo_s 31486 non-null object 42 off_combo_u 75801 non-null object dtypes: datetime64[ns](2), float64(10), int64(3), object(28) memory usage: 26.0+ MB
merged1.head()
| Record ID_r | Offender Number | Release Date_r | Fiscal Year Released_r | Closure Type_r | Supervision Status_r | Sex_r | Race & Ethnicity_r | Age at Release_r | Offense Code_r | ... | Status Revoked | Offense Classification | FY | FY_r | off_combo_r | off_combo | offense_classification | off_combo_l | off_combo_s | off_combo_u | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 1201637 | 2020-03-04 | 2020.0 | Death | Prison | Male | White | 71.0 | 707.2 | ... | NaN | NaN | 2020 | 2020 | A Felony - Murder/Manslaughter | NaN | A Felony - Murder/Manslaughter | NaN | NaN | Unknown - Death |
| 1 | 241.0 | 2412596 | 2015-01-03 | 2015.0 | Death | Prison | Male | White | 65.0 | 707.2 | ... | NaN | NaN | 2015 | 2015 | A Felony - Murder/Manslaughter | NaN | A Felony - Murder/Manslaughter | NaN | NaN | Unknown - Death |
| 2 | 387.0 | 18048653 | 2016-06-19 | 2016.0 | Death | Prison | Male | White | 51.0 | 709.3 | ... | NaN | NaN | 2016 | 2016 | B Felony - Sex | NaN | B Felony - Sex | NaN | NaN | Unknown - Death |
| 3 | 484.0 | 161060 | 2019-07-19 | 2020.0 | Death | Prison | Male | White | 70.0 | 707.2 | ... | NaN | NaN | 2020 | 2020 | A Felony - Murder/Manslaughter | NaN | A Felony - Murder/Manslaughter | NaN | NaN | Unknown - Death |
| 4 | 531.0 | 20094065 | 2025-01-22 | NaN | Death | Prison | Male | White | 84.0 | 707.3 | ... | NaN | NaN | 2025 | 2025 | B Felony - Murder/Manslaughter | NaN | B Felony - Murder/Manslaughter | NaN | NaN | Unknown - Death |
5 rows × 43 columns
merged1.tail()
| Record ID_r | Offender Number | Release Date_r | Fiscal Year Released_r | Closure Type_r | Supervision Status_r | Sex_r | Race & Ethnicity_r | Age at Release_r | Offense Code_r | ... | Status Revoked | Offense Classification | FY | FY_r | off_combo_r | off_combo | offense_classification | off_combo_l | off_combo_s | off_combo_u | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 77467 | NaN | 3366848 | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | Probation | D Felony | -1 | -1 | NaN | D Felony - Traffic | D Felony - Traffic | D Felony - Public Order | NaN | NaN |
| 77468 | NaN | 18470921 | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | Probation | Aggravated Misdemeanor | -1 | -1 | NaN | Aggravated Misdemeanor - Other Violent | Aggravated Misdemeanor - Other Violent | Aggravated Misdemeanor - Violent | NaN | NaN |
| 77469 | NaN | 19329053 | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | Probation | D Felony | -1 | -1 | NaN | D Felony - Forgery/Fraud | D Felony - Forgery/Fraud | D Felony - Property | NaN | NaN |
| 77470 | NaN | 20535884 | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | Probation | D Felony | -1 | -1 | NaN | D Felony - Vandalism | D Felony - Vandalism | D Felony - Property | NaN | NaN |
| 77471 | NaN | 19520360 | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | Probation | C Felony | -1 | -1 | NaN | C Felony - Theft | C Felony - Theft | C Felony - Property | NaN | NaN |
5 rows × 43 columns
#Drop some columns
merged1 = merged1.drop(columns=['Sex_r', 'Race & Ethnicity_r', 'FY_r', 'off_combo_r', 'off_combo_l', 'off_combo_s', 'off_combo_u'])
# Fill FY with Fiscal Year Revoked
merged1['FY'] = merged1['FY'].fillna(merged1['Fiscal Year Revoked'])
find_nulls = merged1.loc[merged1['FY'].isnull()]
find_nulls
# No more nulls in FY
| Record ID_r | Offender Number | Release Date_r | Fiscal Year Released_r | Closure Type_r | Supervision Status_r | Age at Release_r | Offense Code_r | Offense Classification_r | Offense Description_r | ... | Offense Type | Offense Subtype | Offense Code | Record ID | Revocation Date | Status Revoked | Offense Classification | FY | off_combo | offense_classification |
|---|
0 rows × 36 columns
# Add a column to collect the various age columns
merged1['convicted_age'] = pd.Series(dtype='int')
# Start w the easy one, just copying convicted age that was in the released data
def fill_convicted_age(group):
group['convicted_age'] = group['convicted_age'].fillna(group['Convicted Age_r'])
return group
merged1 = merged1.groupby('Offender Number').apply(fill_convicted_age).reset_index(drop=True)
merged1.head()
| Record ID_r | Offender Number | Release Date_r | Fiscal Year Released_r | Closure Type_r | Supervision Status_r | Age at Release_r | Offense Code_r | Offense Classification_r | Offense Description_r | ... | Offense Subtype | Offense Code | Record ID | Revocation Date | Status Revoked | Offense Classification | FY | off_combo | offense_classification | convicted_age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 1201637 | 2020-03-04 | 2020.0 | Death | Prison | 71.0 | 707.2 | A Felony | MURDER 1ST DEGREE | ... | NaN | NaN | NaN | NaT | NaN | NaN | 2020 | NaN | A Felony - Murder/Manslaughter | 30.0 |
| 1 | 241.0 | 2412596 | 2015-01-03 | 2015.0 | Death | Prison | 65.0 | 707.2 | A Felony | MURDER 1ST DEGREE | ... | NaN | NaN | NaN | NaT | NaN | NaN | 2015 | NaN | A Felony - Murder/Manslaughter | 38.0 |
| 2 | 387.0 | 18048653 | 2016-06-19 | 2016.0 | Death | Prison | 51.0 | 709.3 | B Felony | SEXUAL ABUSE, 2ND DEGREE, 85% | ... | NaN | NaN | NaN | NaT | NaN | NaN | 2016 | NaN | B Felony - Sex | 44.0 |
| 3 | 484.0 | 161060 | 2019-07-19 | 2020.0 | Death | Prison | 70.0 | 707.2 | A Felony | MURDER 1ST DEGREE | ... | NaN | NaN | NaN | NaT | NaN | NaN | 2020 | NaN | A Felony - Murder/Manslaughter | 64.0 |
| 4 | 531.0 | 20094065 | 2025-01-22 | NaN | Death | Prison | 84.0 | 707.3 | B Felony | MURDER - 2ND DEGREE, 85% | ... | NaN | NaN | NaN | NaT | NaN | NaN | 2025 | NaN | B Felony - Murder/Manslaughter | 72.0 |
5 rows × 37 columns
# Check an offender
merged1[merged1['Offender Number'] == 18849251]
# dates + age makes sense
| Record ID_r | Offender Number | Release Date_r | Fiscal Year Released_r | Closure Type_r | Supervision Status_r | Age at Release_r | Offense Code_r | Offense Classification_r | Offense Description_r | ... | Offense Subtype | Offense Code | Record ID | Revocation Date | Status Revoked | Offense Classification | FY | off_combo | offense_classification | convicted_age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 66140 | 11933.0 | 18849251 | 2018-07-10 | 2019.0 | Released to Special Sentence | Prison | 32.0 | 708.4(2) | D Felony | WILLFUL INJURY - CAUSING BODILY INJURY | ... | Sex | 903B.1 | NaN | NaT | NaN | NaN | 2019 | NaN | D Felony - Assault | 31.0 |
| 66141 | 11933.0 | 18849251 | 2018-07-10 | 2019.0 | Released to Special Sentence | Prison | 32.0 | 708.4(2) | D Felony | WILLFUL INJURY - CAUSING BODILY INJURY | ... | Sex | 903B.1 | NaN | NaT | NaN | NaN | 2019 | NaN | D Felony - Assault | 31.0 |
| 66142 | 11933.0 | 18849251 | 2018-07-10 | 2019.0 | Released to Special Sentence | Prison | 32.0 | 708.4(2) | D Felony | WILLFUL INJURY - CAUSING BODILY INJURY | ... | Assault | 708.2A(4) | NaN | NaT | NaN | NaN | 2019 | NaN | D Felony - Assault | 31.0 |
| 66143 | 11933.0 | 18849251 | 2018-07-10 | 2019.0 | Released to Special Sentence | Prison | 32.0 | 708.4(2) | D Felony | WILLFUL INJURY - CAUSING BODILY INJURY | ... | Assault | 708.4(2) | NaN | NaT | NaN | NaN | 2019 | NaN | D Felony - Assault | 31.0 |
| 66144 | 11933.0 | 18849251 | 2018-07-10 | 2019.0 | Released to Special Sentence | Prison | 32.0 | 708.4(2) | D Felony | WILLFUL INJURY - CAUSING BODILY INJURY | ... | Sex | 903B.1 | NaN | NaT | NaN | NaN | 2019 | NaN | D Felony - Assault | 31.0 |
| 66145 | 30293.0 | 18849251 | 2020-01-31 | 2020.0 | Released to Special Sentence | Prison | 34.0 | 708.4(2) | D Felony | WILLFUL INJURY - CAUSING BODILY INJURY | ... | Sex | 903B.1 | NaN | NaT | NaN | NaN | 2020 | NaN | D Felony - Assault | 33.0 |
| 66146 | 30293.0 | 18849251 | 2020-01-31 | 2020.0 | Released to Special Sentence | Prison | 34.0 | 708.4(2) | D Felony | WILLFUL INJURY - CAUSING BODILY INJURY | ... | Sex | 903B.1 | NaN | NaT | NaN | NaN | 2020 | NaN | D Felony - Assault | 33.0 |
| 66147 | 30293.0 | 18849251 | 2020-01-31 | 2020.0 | Released to Special Sentence | Prison | 34.0 | 708.4(2) | D Felony | WILLFUL INJURY - CAUSING BODILY INJURY | ... | Assault | 708.2A(4) | NaN | NaT | NaN | NaN | 2020 | NaN | D Felony - Assault | 33.0 |
| 66148 | 30293.0 | 18849251 | 2020-01-31 | 2020.0 | Released to Special Sentence | Prison | 34.0 | 708.4(2) | D Felony | WILLFUL INJURY - CAUSING BODILY INJURY | ... | Assault | 708.4(2) | NaN | NaT | NaN | NaN | 2020 | NaN | D Felony - Assault | 33.0 |
| 66149 | 30293.0 | 18849251 | 2020-01-31 | 2020.0 | Released to Special Sentence | Prison | 34.0 | 708.4(2) | D Felony | WILLFUL INJURY - CAUSING BODILY INJURY | ... | Sex | 903B.1 | NaN | NaT | NaN | NaN | 2020 | NaN | D Felony - Assault | 33.0 |
| 66150 | 43267.0 | 18849251 | 2018-12-17 | 2019.0 | Revoked | Special Sentence | 33.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | Sex | 903B.1 | NaN | NaT | NaN | NaN | 2019 | NaN | Special Sentence 2005 - Sex | 33.0 |
| 66151 | 43267.0 | 18849251 | 2018-12-17 | 2019.0 | Revoked | Special Sentence | 33.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | Sex | 903B.1 | NaN | NaT | NaN | NaN | 2019 | NaN | Special Sentence 2005 - Sex | 33.0 |
| 66152 | 43267.0 | 18849251 | 2018-12-17 | 2019.0 | Revoked | Special Sentence | 33.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | Assault | 708.2A(4) | NaN | NaT | NaN | NaN | 2019 | NaN | Special Sentence 2005 - Sex | 33.0 |
| 66153 | 43267.0 | 18849251 | 2018-12-17 | 2019.0 | Revoked | Special Sentence | 33.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | Assault | 708.4(2) | NaN | NaT | NaN | NaN | 2019 | NaN | Special Sentence 2005 - Sex | 33.0 |
| 66154 | 43267.0 | 18849251 | 2018-12-17 | 2019.0 | Revoked | Special Sentence | 33.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | Sex | 903B.1 | NaN | NaT | NaN | NaN | 2019 | NaN | Special Sentence 2005 - Sex | 33.0 |
| 66155 | 13573.0 | 18849251 | 2015-12-31 | 2016.0 | Work Unit - Transfer from County Jail Contract | Special Sentence | 30.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | Sex | 903B.1 | NaN | NaT | NaN | NaN | 2016 | NaN | Special Sentence 2005 - Sex | 30.0 |
| 66156 | 13573.0 | 18849251 | 2015-12-31 | 2016.0 | Work Unit - Transfer from County Jail Contract | Special Sentence | 30.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | Sex | 903B.1 | NaN | NaT | NaN | NaN | 2016 | NaN | Special Sentence 2005 - Sex | 30.0 |
| 66157 | 13573.0 | 18849251 | 2015-12-31 | 2016.0 | Work Unit - Transfer from County Jail Contract | Special Sentence | 30.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | Assault | 708.2A(4) | NaN | NaT | NaN | NaN | 2016 | NaN | Special Sentence 2005 - Sex | 30.0 |
| 66158 | 13573.0 | 18849251 | 2015-12-31 | 2016.0 | Work Unit - Transfer from County Jail Contract | Special Sentence | 30.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | Assault | 708.4(2) | NaN | NaT | NaN | NaN | 2016 | NaN | Special Sentence 2005 - Sex | 30.0 |
| 66159 | 13573.0 | 18849251 | 2015-12-31 | 2016.0 | Work Unit - Transfer from County Jail Contract | Special Sentence | 30.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | Sex | 903B.1 | NaN | NaT | NaN | NaN | 2016 | NaN | Special Sentence 2005 - Sex | 30.0 |
| 66160 | 42131.0 | 18849251 | 2015-08-28 | 2016.0 | Work Unit - Transfer from County Jail Contract | Work Release | 30.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | Sex | 903B.1 | NaN | NaT | NaN | NaN | 2016 | NaN | Special Sentence 2005 - Sex | 30.0 |
| 66161 | 42131.0 | 18849251 | 2015-08-28 | 2016.0 | Work Unit - Transfer from County Jail Contract | Work Release | 30.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | Sex | 903B.1 | NaN | NaT | NaN | NaN | 2016 | NaN | Special Sentence 2005 - Sex | 30.0 |
| 66162 | 42131.0 | 18849251 | 2015-08-28 | 2016.0 | Work Unit - Transfer from County Jail Contract | Work Release | 30.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | Assault | 708.2A(4) | NaN | NaT | NaN | NaN | 2016 | NaN | Special Sentence 2005 - Sex | 30.0 |
| 66163 | 42131.0 | 18849251 | 2015-08-28 | 2016.0 | Work Unit - Transfer from County Jail Contract | Work Release | 30.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | Assault | 708.4(2) | NaN | NaT | NaN | NaN | 2016 | NaN | Special Sentence 2005 - Sex | 30.0 |
| 66164 | 42131.0 | 18849251 | 2015-08-28 | 2016.0 | Work Unit - Transfer from County Jail Contract | Work Release | 30.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | Sex | 903B.1 | NaN | NaT | NaN | NaN | 2016 | NaN | Special Sentence 2005 - Sex | 30.0 |
25 rows × 37 columns
merged1['test_age'] = pd.Series(dtype='int')
merged1['test_age'] = merged1['test_age'].fillna(merged1['convicted_age'])
merged1['test_age'] = merged1['test_age'].fillna(merged1['Age'])
# test case
merged1[merged1['Offender Number'] == 18849251] #still matching
| Record ID_r | Offender Number | Release Date_r | Fiscal Year Released_r | Closure Type_r | Supervision Status_r | Age at Release_r | Offense Code_r | Offense Classification_r | Offense Description_r | ... | Offense Code | Record ID | Revocation Date | Status Revoked | Offense Classification | FY | off_combo | offense_classification | convicted_age | test_age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 66140 | 11933.0 | 18849251 | 2018-07-10 | 2019.0 | Released to Special Sentence | Prison | 32.0 | 708.4(2) | D Felony | WILLFUL INJURY - CAUSING BODILY INJURY | ... | 903B.1 | NaN | NaT | NaN | NaN | 2019 | NaN | D Felony - Assault | 31.0 | 31.0 |
| 66141 | 11933.0 | 18849251 | 2018-07-10 | 2019.0 | Released to Special Sentence | Prison | 32.0 | 708.4(2) | D Felony | WILLFUL INJURY - CAUSING BODILY INJURY | ... | 903B.1 | NaN | NaT | NaN | NaN | 2019 | NaN | D Felony - Assault | 31.0 | 31.0 |
| 66142 | 11933.0 | 18849251 | 2018-07-10 | 2019.0 | Released to Special Sentence | Prison | 32.0 | 708.4(2) | D Felony | WILLFUL INJURY - CAUSING BODILY INJURY | ... | 708.2A(4) | NaN | NaT | NaN | NaN | 2019 | NaN | D Felony - Assault | 31.0 | 31.0 |
| 66143 | 11933.0 | 18849251 | 2018-07-10 | 2019.0 | Released to Special Sentence | Prison | 32.0 | 708.4(2) | D Felony | WILLFUL INJURY - CAUSING BODILY INJURY | ... | 708.4(2) | NaN | NaT | NaN | NaN | 2019 | NaN | D Felony - Assault | 31.0 | 31.0 |
| 66144 | 11933.0 | 18849251 | 2018-07-10 | 2019.0 | Released to Special Sentence | Prison | 32.0 | 708.4(2) | D Felony | WILLFUL INJURY - CAUSING BODILY INJURY | ... | 903B.1 | NaN | NaT | NaN | NaN | 2019 | NaN | D Felony - Assault | 31.0 | 31.0 |
| 66145 | 30293.0 | 18849251 | 2020-01-31 | 2020.0 | Released to Special Sentence | Prison | 34.0 | 708.4(2) | D Felony | WILLFUL INJURY - CAUSING BODILY INJURY | ... | 903B.1 | NaN | NaT | NaN | NaN | 2020 | NaN | D Felony - Assault | 33.0 | 33.0 |
| 66146 | 30293.0 | 18849251 | 2020-01-31 | 2020.0 | Released to Special Sentence | Prison | 34.0 | 708.4(2) | D Felony | WILLFUL INJURY - CAUSING BODILY INJURY | ... | 903B.1 | NaN | NaT | NaN | NaN | 2020 | NaN | D Felony - Assault | 33.0 | 33.0 |
| 66147 | 30293.0 | 18849251 | 2020-01-31 | 2020.0 | Released to Special Sentence | Prison | 34.0 | 708.4(2) | D Felony | WILLFUL INJURY - CAUSING BODILY INJURY | ... | 708.2A(4) | NaN | NaT | NaN | NaN | 2020 | NaN | D Felony - Assault | 33.0 | 33.0 |
| 66148 | 30293.0 | 18849251 | 2020-01-31 | 2020.0 | Released to Special Sentence | Prison | 34.0 | 708.4(2) | D Felony | WILLFUL INJURY - CAUSING BODILY INJURY | ... | 708.4(2) | NaN | NaT | NaN | NaN | 2020 | NaN | D Felony - Assault | 33.0 | 33.0 |
| 66149 | 30293.0 | 18849251 | 2020-01-31 | 2020.0 | Released to Special Sentence | Prison | 34.0 | 708.4(2) | D Felony | WILLFUL INJURY - CAUSING BODILY INJURY | ... | 903B.1 | NaN | NaT | NaN | NaN | 2020 | NaN | D Felony - Assault | 33.0 | 33.0 |
| 66150 | 43267.0 | 18849251 | 2018-12-17 | 2019.0 | Revoked | Special Sentence | 33.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | 903B.1 | NaN | NaT | NaN | NaN | 2019 | NaN | Special Sentence 2005 - Sex | 33.0 | 33.0 |
| 66151 | 43267.0 | 18849251 | 2018-12-17 | 2019.0 | Revoked | Special Sentence | 33.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | 903B.1 | NaN | NaT | NaN | NaN | 2019 | NaN | Special Sentence 2005 - Sex | 33.0 | 33.0 |
| 66152 | 43267.0 | 18849251 | 2018-12-17 | 2019.0 | Revoked | Special Sentence | 33.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | 708.2A(4) | NaN | NaT | NaN | NaN | 2019 | NaN | Special Sentence 2005 - Sex | 33.0 | 33.0 |
| 66153 | 43267.0 | 18849251 | 2018-12-17 | 2019.0 | Revoked | Special Sentence | 33.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | 708.4(2) | NaN | NaT | NaN | NaN | 2019 | NaN | Special Sentence 2005 - Sex | 33.0 | 33.0 |
| 66154 | 43267.0 | 18849251 | 2018-12-17 | 2019.0 | Revoked | Special Sentence | 33.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | 903B.1 | NaN | NaT | NaN | NaN | 2019 | NaN | Special Sentence 2005 - Sex | 33.0 | 33.0 |
| 66155 | 13573.0 | 18849251 | 2015-12-31 | 2016.0 | Work Unit - Transfer from County Jail Contract | Special Sentence | 30.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | 903B.1 | NaN | NaT | NaN | NaN | 2016 | NaN | Special Sentence 2005 - Sex | 30.0 | 30.0 |
| 66156 | 13573.0 | 18849251 | 2015-12-31 | 2016.0 | Work Unit - Transfer from County Jail Contract | Special Sentence | 30.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | 903B.1 | NaN | NaT | NaN | NaN | 2016 | NaN | Special Sentence 2005 - Sex | 30.0 | 30.0 |
| 66157 | 13573.0 | 18849251 | 2015-12-31 | 2016.0 | Work Unit - Transfer from County Jail Contract | Special Sentence | 30.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | 708.2A(4) | NaN | NaT | NaN | NaN | 2016 | NaN | Special Sentence 2005 - Sex | 30.0 | 30.0 |
| 66158 | 13573.0 | 18849251 | 2015-12-31 | 2016.0 | Work Unit - Transfer from County Jail Contract | Special Sentence | 30.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | 708.4(2) | NaN | NaT | NaN | NaN | 2016 | NaN | Special Sentence 2005 - Sex | 30.0 | 30.0 |
| 66159 | 13573.0 | 18849251 | 2015-12-31 | 2016.0 | Work Unit - Transfer from County Jail Contract | Special Sentence | 30.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | 903B.1 | NaN | NaT | NaN | NaN | 2016 | NaN | Special Sentence 2005 - Sex | 30.0 | 30.0 |
| 66160 | 42131.0 | 18849251 | 2015-08-28 | 2016.0 | Work Unit - Transfer from County Jail Contract | Work Release | 30.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | 903B.1 | NaN | NaT | NaN | NaN | 2016 | NaN | Special Sentence 2005 - Sex | 30.0 | 30.0 |
| 66161 | 42131.0 | 18849251 | 2015-08-28 | 2016.0 | Work Unit - Transfer from County Jail Contract | Work Release | 30.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | 903B.1 | NaN | NaT | NaN | NaN | 2016 | NaN | Special Sentence 2005 - Sex | 30.0 | 30.0 |
| 66162 | 42131.0 | 18849251 | 2015-08-28 | 2016.0 | Work Unit - Transfer from County Jail Contract | Work Release | 30.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | 708.2A(4) | NaN | NaT | NaN | NaN | 2016 | NaN | Special Sentence 2005 - Sex | 30.0 | 30.0 |
| 66163 | 42131.0 | 18849251 | 2015-08-28 | 2016.0 | Work Unit - Transfer from County Jail Contract | Work Release | 30.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | 708.4(2) | NaN | NaT | NaN | NaN | 2016 | NaN | Special Sentence 2005 - Sex | 30.0 | 30.0 |
| 66164 | 42131.0 | 18849251 | 2015-08-28 | 2016.0 | Work Unit - Transfer from County Jail Contract | Work Release | 30.0 | 903B.1 | Special Sentence 2005 | Special Supervision Sentence-B or C Felony/Lif... | ... | 903B.1 | NaN | NaT | NaN | NaN | 2016 | NaN | Special Sentence 2005 - Sex | 30.0 | 30.0 |
25 rows × 38 columns
# Try another case that's not from released
merged1[merged1['Offender Number'] == 18832127]
# This person should be older in the 2nd line if going by revoke date vs record date
| Record ID_r | Offender Number | Release Date_r | Fiscal Year Released_r | Closure Type_r | Supervision Status_r | Age at Release_r | Offense Code_r | Offense Classification_r | Offense Description_r | ... | Offense Code | Record ID | Revocation Date | Status Revoked | Offense Classification | FY | off_combo | offense_classification | convicted_age | test_age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 75928 | NaN | 18832127 | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 321J.2(2)(c) | NaN | NaT | NaN | NaN | -1 | NaN | D Felony - OWI | NaN | 34.0 |
| 75929 | NaN | 18832127 | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | 14925.0 | 2024-09-25 | Probation | D Felony | -1 | D Felony - Traffic | D Felony - Traffic | NaN | 32.0 |
2 rows × 38 columns
#merged1['convicted_age'] = merged1['convicted_age'].astype(int)
def get_row_with_overall_min(df, offender_col, value_cols, related_cols):
"""
Finds the row with the overall maximum value across specified columns for each offender
and retrieves the corresponding values from the related columns.
Args:
df (pd.DataFrame): The input DataFrame.
offender_col (str): The name of the column identifying the grouping variable.
value_cols (list of str): A list of column names containing the values to compare.
related_cols (list of str): A list of column names to retrieve from the row
containing the overall maximum value.
Returns:
pd.DataFrame: A DataFrame with the offender column and the corresponding
values from the related columns for the row with the overall maximum.
"""
# Convert value columns to numeric, coercing errors
for col in value_cols:
df[col] = pd.to_numeric(df[col], errors='coerce')
# Stack the value columns to easily find the overall maximum
df_stacked = df.melt(id_vars=[offender_col] + related_cols, value_vars=value_cols, var_name='value_col', value_name='value')
# Find the index of the row with the maximum value for each offender
idx = df_stacked.loc[df_stacked.groupby(offender_col)['value'].idxmin()]
# Select the offender column and the related columns
result_df = idx[[offender_col] + related_cols].reset_index(drop=True)
return result_df
# Define the parameters
offender_column = 'Offender Number'
value_columns = ['FY']
related_columns = ['Sex', 'Race & Ethnicity', 'offense_classification', 'convicted_age', 'test_age']
# Call the new min function
sentence_min = get_row_with_overall_min(merged1.copy(), offender_column, value_columns, related_columns)
sentence_min
| Offender Number | Sex | Race & Ethnicity | offense_classification | convicted_age | test_age | |
|---|---|---|---|---|---|---|
| 0 | 719 | Male | White | B Felony - Trafficking | 47.0 | 47.0 |
| 1 | 1190 | Male | White | D Felony - Burglary | 55.0 | 55.0 |
| 2 | 1355 | Male | White | Aggravated Misdemeanor - Assault | 54.0 | 54.0 |
| 3 | 1478 | Male | White | D Felony - Vandalism | 48.0 | 48.0 |
| 4 | 1856 | Male | White | D Felony - OWI | 66.0 | 66.0 |
| ... | ... | ... | ... | ... | ... | ... |
| 33407 | 20999486 | Male | White | D Felony - Assault | 27.0 | 27.0 |
| 33408 | 20999729 | Male | White | C Felony - Assault | 22.0 | 22.0 |
| 33409 | 20999813 | Male | White | C Felony - Sex | 25.0 | 25.0 |
| 33410 | 22800377 | Male | White | C Felony - Assault | 56.0 | 56.0 |
| 33411 | 29703959 | Male | Hispanic | D Felony - Burglary | 38.0 | 38.0 |
33412 rows × 6 columns
rep_offender = merged1[merged1['Offender Number'] == 20081486]
rep_offender
# Makes sense when calc 1st line calcs to 64.0833; 2nd line calcs to 63.8333
| Record ID_r | Offender Number | Release Date_r | Fiscal Year Released_r | Closure Type_r | Supervision Status_r | Age at Release_r | Offense Code_r | Offense Classification_r | Offense Description_r | ... | Offense Code | Record ID | Revocation Date | Status Revoked | Offense Classification | FY | off_combo | offense_classification | convicted_age | test_age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 17 | 3994.0 | 20081486 | 2022-01-16 | 2022.0 | Death | Prison | 65.0 | 709.8(2) | C Felony | Lascivious Acts With Child-Life Special Sentence | ... | NaN | NaN | NaT | NaN | NaN | 2022 | NaN | C Felony - Sex | 64.0 | 64.0 |
1 rows × 38 columns
sentence_min['offense_classification'] = merged1['offense_classification']
sentence_min
| Offender Number | Sex | Race & Ethnicity | offense_classification | convicted_age | test_age | |
|---|---|---|---|---|---|---|
| 0 | 719 | Male | White | A Felony - Murder/Manslaughter | 47.0 | 47.0 |
| 1 | 1190 | Male | White | A Felony - Murder/Manslaughter | 55.0 | 55.0 |
| 2 | 1355 | Male | White | B Felony - Sex | 54.0 | 54.0 |
| 3 | 1478 | Male | White | A Felony - Murder/Manslaughter | 48.0 | 48.0 |
| 4 | 1856 | Male | White | B Felony - Murder/Manslaughter | 66.0 | 66.0 |
| ... | ... | ... | ... | ... | ... | ... |
| 33407 | 20999486 | Male | White | C Felony - Trafficking | 27.0 | 27.0 |
| 33408 | 20999729 | Male | White | C Felony - Trafficking | 22.0 | 22.0 |
| 33409 | 20999813 | Male | White | C Felony - Trafficking | 25.0 | 25.0 |
| 33410 | 22800377 | Male | White | C Felony - Trafficking | 56.0 | 56.0 |
| 33411 | 29703959 | Male | Hispanic | Aggravated Misdemeanor - Burglary | 38.0 | 38.0 |
33412 rows × 6 columns
#Load the foster population spreadsheet
foster_pop = pd.read_csv('Children in foster care.csv')
foster_pop
| TimeFrame | Data | Male | Female | Hispanic or Latino | American Indian | Asian/Native Hawaiian | Black | Multiple race groups | White | Race unknown | <1 | 1 to 5 | 6 to 10 | 11 to 15 | 16 to 20 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015 | 5918 | 3240 | 2671 | 591.80 | 118.36 | 59.18 | 769.34 | 295.90 | 3905.88 | 177.54 | 355.08 | 1775.40 | 1183.60 | 1420.32 | 1183.60 |
| 1 | 2016 | 6004 | 3272 | 2732 | 600.40 | 120.08 | 60.04 | 780.52 | 300.20 | 3962.64 | 240.16 | 360.24 | 1981.32 | 1320.88 | 1380.92 | 1020.68 |
| 2 | 2017 | 5952 | 3098 | 2854 | 535.68 | 59.52 | 59.52 | 654.72 | 297.60 | 4106.88 | 238.08 | 416.64 | 2142.72 | 1428.48 | 1309.44 | 654.72 |
| 3 | 2018 | 6249 | 3255 | 2988 | 562.41 | 124.98 | 62.49 | 812.37 | 312.45 | 4124.34 | 249.96 | 369.91 | 1854.11 | 1229.02 | 1243.97 | 744.06 |
| 4 | 2019 | 5955 | 3076 | 2872 | 535.95 | 119.10 | 59.55 | 714.60 | 357.30 | 3930.30 | 297.75 | 476.40 | 2143.80 | 1369.65 | 1369.65 | 595.50 |
| 5 | 2020 | 4660 | 2446 | 2210 | 419.40 | 93.20 | 46.60 | 605.80 | 279.60 | 3122.20 | 93.20 | 279.60 | 1631.00 | 1118.40 | 1071.80 | 512.60 |
| 6 | 2021 | 4144 | 2124 | 2017 | 372.96 | 82.88 | 41.44 | 621.60 | 248.64 | 2735.04 | 82.88 | 331.52 | 1450.40 | 953.12 | 911.68 | 497.28 |
foster_pop.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7 entries, 0 to 6 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 TimeFrame 7 non-null int64 1 Data 7 non-null int64 2 Male 7 non-null int64 3 Female 7 non-null int64 4 Hispanic or Latino 7 non-null float64 5 American Indian 7 non-null float64 6 Asian/Native Hawaiian 7 non-null float64 7 Black 7 non-null float64 8 Multiple race groups 7 non-null float64 9 White 7 non-null float64 10 Race unknown 7 non-null float64 11 <1 7 non-null float64 12 1 to 5 7 non-null float64 13 6 to 10 7 non-null float64 14 11 to 15 7 non-null float64 15 16 to 20 7 non-null float64 dtypes: float64(12), int64(4) memory usage: 1.0 KB
sentence_min.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 33412 entries, 0 to 33411 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Offender Number 33412 non-null int64 1 Sex 33412 non-null object 2 Race & Ethnicity 33411 non-null object 3 offense_classification 33412 non-null object 4 convicted_age 31755 non-null float64 5 test_age 33412 non-null float64 dtypes: float64(2), int64(1), object(3) memory usage: 1.5+ MB
# Change test_age to int
sentence_min['test_age'] = sentence_min['test_age'].astype(int)
sentence_min.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 33412 entries, 0 to 33411 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Offender Number 33412 non-null int64 1 Sex 33412 non-null object 2 Race & Ethnicity 33411 non-null object 3 offense_classification 33412 non-null object 4 convicted_age 31755 non-null float64 5 test_age 33412 non-null int64 dtypes: float64(1), int64(2), object(3) memory usage: 1.5+ MB
col_val_list = sentence_min['Sex'].unique()
col_val_list
array(['Male', 'Female'], dtype=object)
#Adding a female feature to the min sentence set
sentence_min['female'] = sentence_min['Sex'].replace({'Female': 1, 'Male': 0, 'Unknown': 3})
sentence_min['female'] = sentence_min['female'].astype(int)
sentence_min.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 33412 entries, 0 to 33411 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Offender Number 33412 non-null int64 1 Sex 33412 non-null object 2 Race & Ethnicity 33411 non-null object 3 offense_classification 33412 non-null object 4 convicted_age 31755 non-null float64 5 test_age 33412 non-null int64 6 female 33412 non-null int64 dtypes: float64(1), int64(3), object(3) memory usage: 1.8+ MB
col_val_list = sentence_min['offense_classification'].unique()
col_val_list
array(['A Felony - Murder/Manslaughter', 'B Felony - Sex',
'B Felony - Murder/Manslaughter', 'A Felony - Kidnap',
'C Felony - Assault', 'Other Felony - Other Criminal',
'Aggravated Misdemeanor - Theft', 'C Felony - Sex',
'NCIC Virtual Code - Other Violent',
'C Felony - Murder/Manslaughter', 'C Felony - Theft',
'Felony - Enhancement to Original Penalty - Sex',
'Other Felony - Murder/Manslaughter', 'D Felony - Other Criminal',
'B Felony - Robbery', 'C Felony - Trafficking',
'D Felony - Drug Possession', 'D Felony - Burglary',
'Felony - Enhancement to Original Penalty - Trafficking',
'Special Sentence 2005 - Sex', 'C Felony - Burglary',
'Other Felony - Sex', 'A Felony - Sex',
'Aggravated Misdemeanor - OWI', 'C Felony - Robbery',
'D Felony - OWI', 'Aggravated Misdemeanor - Assault',
'C Felony - Vandalism', 'D Felony - Assault',
'D Felony - Forgery/Fraud', 'D Felony - Weapons',
'B Felony - Other Violent', 'B Felony - Trafficking',
'B Felony - Kidnap', 'D Felony - Other Public Order',
'D Felony - Theft', 'Aggravated Misdemeanor - Other Criminal',
'Aggravated Misdemeanor - Drug Possession', 'B Felony - Arson',
'Aggravated Misdemeanor - Traffic', 'B Felony - Other Criminal',
'C Felony - Arson', 'C Felony - Other Violent',
'NCIC Virtual Code - Murder/Manslaughter',
'D Felony - Trafficking', 'Aggravated Misdemeanor - Animals',
'Serious Misdemeanor - Assault',
'Aggravated Misdemeanor - Burglary',
'Aggravated Misdemeanor - Other Violent',
'Aggravated Misdemeanor - Alcohol', 'D Felony - Other Violent',
'Aggravated Misdemeanor - Weapons', 'D Felony - Traffic',
'D Felony - Other Drug', 'Aggravated Misdemeanor - Sex',
'D Felony - Kidnap', 'D Felony - Flight/Escape',
'Aggravated Misdemeanor - Other Public Order',
'Aggravated Misdemeanor - Vandalism',
'Aggravated Misdemeanor - Forgery/Fraud',
'Aggravated Misdemeanor - Prostitution/Pimping',
'D Felony - Vandalism', 'Serious Misdemeanor - Theft',
'D Felony - Murder/Manslaughter', 'C Felony - Other Criminal',
'D Felony - Arson', 'Aggravated Misdemeanor - Arson',
'C Felony - Other Public Order',
'Serious Misdemeanor - Drug Possession', 'C Felony - Traffic',
'Aggravated Misdemeanor - Kidnap', 'C Felony - Kidnap',
'D Felony - Sex', 'Serious Misdemeanor - Other Property',
'Aggravated Misdemeanor - Trafficking',
'Serious Misdemeanor - Other Public Order',
'Aggravated Misdemeanor - Robbery',
'Serious Misdemeanor - Flight/Escape',
'Serious Misdemeanor - Traffic',
'Aggravated Misdemeanor - Murder/Manslaughter',
'Serious Misdemeanor - Vandalism', 'Serious Misdemeanor - OWI',
'Simple Misdemeanor - Other Public Order',
'C Felony - Forgery/Fraud', 'C Felony - Other Drug',
'Felony - Mandatory Minimum - Sex', 'Serious Misdemeanor - Arson',
'Serious Misdemeanor - Sex', 'Serious Misdemeanor - Alcohol',
'C Felony - Weapons', 'Unknown - Returned to Sending Jurisdiction',
'Aggravated Misdemeanor - Flight/Escape',
'Felony - Mandatory Minimum - Other Government',
'D Felony - Prostitution/Pimping',
'Felony - Enhancement to Original Penalty - Other Criminal',
'NCIC Virtual Code - Other Drug', 'C Felony - Drug Possession',
'Aggravated Misdemeanor - Other Drug',
'Simple Misdemeanor - Burglary'], dtype=object)
# Splitting offense one time on the dash (-)
def split_offense(offense):
return pd.Series(offense.split(" - ", 1))
# Now has a class of crime and a specificiation or type of crime
sentence_min[['class', 'specification']] = sentence_min['offense_classification'].apply(split_offense)
sentence_min.head()
| Offender Number | Sex | Race & Ethnicity | offense_classification | convicted_age | test_age | female | class | specification | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 719 | Male | White | A Felony - Murder/Manslaughter | 47.0 | 47 | 0 | A Felony | Murder/Manslaughter |
| 1 | 1190 | Male | White | A Felony - Murder/Manslaughter | 55.0 | 55 | 0 | A Felony | Murder/Manslaughter |
| 2 | 1355 | Male | White | B Felony - Sex | 54.0 | 54 | 0 | B Felony | Sex |
| 3 | 1478 | Male | White | A Felony - Murder/Manslaughter | 48.0 | 48 | 0 | A Felony | Murder/Manslaughter |
| 4 | 1856 | Male | White | B Felony - Murder/Manslaughter | 66.0 | 66 | 0 | B Felony | Murder/Manslaughter |
col_val_list = sentence_min['class'].unique()
col_val_list # Number of types of offenses are greatly reduced
array(['A Felony', 'B Felony', 'C Felony', 'Other Felony',
'Aggravated Misdemeanor', 'NCIC Virtual Code', 'Felony',
'D Felony', 'Special Sentence 2005', 'Serious Misdemeanor',
'Simple Misdemeanor', 'Unknown'], dtype=object)
# Features to predict labels
all_features = ['Race & Ethnicity', 'test_age', 'female']
# Convert categorical features to numerical
cat_features = ['Race & Ethnicity']
# Select the above features and encode cat_features
sentence_min_X = pd.get_dummies(sentence_min[all_features], prefix = cat_features, columns = cat_features)
sentence_min_X.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 33412 entries, 0 to 33411 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 test_age 33412 non-null int64 1 female 33412 non-null int64 2 Race & Ethnicity_American Indian or Alaska Native 33412 non-null uint8 3 Race & Ethnicity_Asian or Pacific Islander 33412 non-null uint8 4 Race & Ethnicity_Black 33412 non-null uint8 5 Race & Ethnicity_Hispanic 33412 non-null uint8 6 Race & Ethnicity_Unknown 33412 non-null uint8 7 Race & Ethnicity_White 33412 non-null uint8 dtypes: int64(2), uint8(6) memory usage: 718.0 KB
# Encode label
label_encoder = LabelEncoder()
sentence_min_y = label_encoder.fit_transform(sentence_min['offense_classification'])
sentence_min_y = label_encoder.fit_transform(sentence_min['offense_classification'])
print(label_encoder.classes_)
print(sentence_min_y)
['A Felony - Kidnap' 'A Felony - Murder/Manslaughter' 'A Felony - Sex' 'Aggravated Misdemeanor - Alcohol' 'Aggravated Misdemeanor - Animals' 'Aggravated Misdemeanor - Arson' 'Aggravated Misdemeanor - Assault' 'Aggravated Misdemeanor - Burglary' 'Aggravated Misdemeanor - Drug Possession' 'Aggravated Misdemeanor - Flight/Escape' 'Aggravated Misdemeanor - Forgery/Fraud' 'Aggravated Misdemeanor - Kidnap' 'Aggravated Misdemeanor - Murder/Manslaughter' 'Aggravated Misdemeanor - OWI' 'Aggravated Misdemeanor - Other Criminal' 'Aggravated Misdemeanor - Other Drug' 'Aggravated Misdemeanor - Other Public Order' 'Aggravated Misdemeanor - Other Violent' 'Aggravated Misdemeanor - Prostitution/Pimping' 'Aggravated Misdemeanor - Robbery' 'Aggravated Misdemeanor - Sex' 'Aggravated Misdemeanor - Theft' 'Aggravated Misdemeanor - Traffic' 'Aggravated Misdemeanor - Trafficking' 'Aggravated Misdemeanor - Vandalism' 'Aggravated Misdemeanor - Weapons' 'B Felony - Arson' 'B Felony - Kidnap' 'B Felony - Murder/Manslaughter' 'B Felony - Other Criminal' 'B Felony - Other Violent' 'B Felony - Robbery' 'B Felony - Sex' 'B Felony - Trafficking' 'C Felony - Arson' 'C Felony - Assault' 'C Felony - Burglary' 'C Felony - Drug Possession' 'C Felony - Forgery/Fraud' 'C Felony - Kidnap' 'C Felony - Murder/Manslaughter' 'C Felony - Other Criminal' 'C Felony - Other Drug' 'C Felony - Other Public Order' 'C Felony - Other Violent' 'C Felony - Robbery' 'C Felony - Sex' 'C Felony - Theft' 'C Felony - Traffic' 'C Felony - Trafficking' 'C Felony - Vandalism' 'C Felony - Weapons' 'D Felony - Arson' 'D Felony - Assault' 'D Felony - Burglary' 'D Felony - Drug Possession' 'D Felony - Flight/Escape' 'D Felony - Forgery/Fraud' 'D Felony - Kidnap' 'D Felony - Murder/Manslaughter' 'D Felony - OWI' 'D Felony - Other Criminal' 'D Felony - Other Drug' 'D Felony - Other Public Order' 'D Felony - Other Violent' 'D Felony - Prostitution/Pimping' 'D Felony - Sex' 'D Felony - Theft' 'D Felony - Traffic' 'D Felony - Trafficking' 'D Felony - Vandalism' 'D Felony - Weapons' 'Felony - Enhancement to Original Penalty - Other Criminal' 'Felony - Enhancement to Original Penalty - Sex' 'Felony - Enhancement to Original Penalty - Trafficking' 'Felony - Mandatory Minimum - Other Government' 'Felony - Mandatory Minimum - Sex' 'NCIC Virtual Code - Murder/Manslaughter' 'NCIC Virtual Code - Other Drug' 'NCIC Virtual Code - Other Violent' 'Other Felony - Murder/Manslaughter' 'Other Felony - Other Criminal' 'Other Felony - Sex' 'Serious Misdemeanor - Alcohol' 'Serious Misdemeanor - Arson' 'Serious Misdemeanor - Assault' 'Serious Misdemeanor - Drug Possession' 'Serious Misdemeanor - Flight/Escape' 'Serious Misdemeanor - OWI' 'Serious Misdemeanor - Other Property' 'Serious Misdemeanor - Other Public Order' 'Serious Misdemeanor - Sex' 'Serious Misdemeanor - Theft' 'Serious Misdemeanor - Traffic' 'Serious Misdemeanor - Vandalism' 'Simple Misdemeanor - Burglary' 'Simple Misdemeanor - Other Public Order' 'Special Sentence 2005 - Sex' 'Unknown - Returned to Sending Jurisdiction'] [ 1 1 32 ... 49 49 7]
# Instantiate the Random Forest
sentence_min_rf = RandomForestClassifier(n_estimators = 500, oob_score = True, random_state = 12345)
# using a random_state so can keep results consistent, changing n_estimators didn't have a great impact on results
# Train the Random Forest Classifier
sentence_min_rf.fit(sentence_min_X, sentence_min_y)
print(f'Estimated accuracy with OOB data: {sentence_min_rf.oob_score_:.4f}')
print(f'Training data accuracy: {sentence_min_rf.score(sentence_min_X, sentence_min_y):.4f}')
# 9% results w OOB, 12% - not great numbers
Estimated accuracy with OOB data: 0.0920 Training data accuracy: 0.1209
# This is an extremely low score, I'm going to divide the offense classification back to Felony, Aggravated...
# as the org offenses is too much data, I'm going to divide at the - and encode the first part
col_val_list = sentence_min['class'].unique()
col_val_list # Number of types of offenses are greatly reduced
array(['A Felony', 'B Felony', 'C Felony', 'Other Felony',
'Aggravated Misdemeanor', 'NCIC Virtual Code', 'Felony',
'D Felony', 'Special Sentence 2005', 'Serious Misdemeanor',
'Simple Misdemeanor', 'Unknown'], dtype=object)
# Remove Letter from before "Felony"
sentence_min.loc[sentence_min['class'].str.contains('Felony'), 'class'] = 'Felony'
sentence_min.head()
| Offender Number | Sex | Race & Ethnicity | offense_classification | convicted_age | test_age | female | class | specification | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 719 | Male | White | A Felony - Murder/Manslaughter | 47.0 | 47 | 0 | Felony | Murder/Manslaughter |
| 1 | 1190 | Male | White | A Felony - Murder/Manslaughter | 55.0 | 55 | 0 | Felony | Murder/Manslaughter |
| 2 | 1355 | Male | White | B Felony - Sex | 54.0 | 54 | 0 | Felony | Sex |
| 3 | 1478 | Male | White | A Felony - Murder/Manslaughter | 48.0 | 48 | 0 | Felony | Murder/Manslaughter |
| 4 | 1856 | Male | White | B Felony - Murder/Manslaughter | 66.0 | 66 | 0 | Felony | Murder/Manslaughter |
# Encode label
label_encoder = LabelEncoder()
sentence_min_y = label_encoder.fit_transform(sentence_min['class'])
print(label_encoder.classes_)
print(sentence_min_y)
['Aggravated Misdemeanor' 'Felony' 'NCIC Virtual Code' 'Serious Misdemeanor' 'Simple Misdemeanor' 'Special Sentence 2005' 'Unknown'] [1 1 1 ... 1 1 0]
# Instantiate the Random Forest
sentence_min_rf = RandomForestClassifier(n_estimators = 500, oob_score = True, random_state = 12345)
# using a random_state so can keep results consistent, changing n_estimators didn't have a great impact on results
# Train the Random Forest Classifier
sentence_min_rf.fit(sentence_min_X, sentence_min_y)
print(f'Estimated accuracy with OOB data: {sentence_min_rf.oob_score_:.4f}')
print(f'Training data accuracy: {sentence_min_rf.score(sentence_min_X, sentence_min_y):.4f}')
# was 9% results w OOB, 12% - not great numbers; now 76% Better, increasing n_estimators dec numbers
Estimated accuracy with OOB data: 0.7626 Training data accuracy: 0.7651
sns.scatterplot(x = 'Sex', y = 'class', data = sentence_min)
plt.xticks(rotation=45, ha='right')
plt.title('Offender Sex')
plt.xlabel('Sex')
plt.ylabel('Crime Class')
plt.show()
# No relationship between class of crime and sex
sns.scatterplot(x = 'Race & Ethnicity', y = 'class', data = sentence_min)
plt.xticks(rotation=45, ha='right')
plt.title('Race & Ethnicity')
plt.xlabel('Race & Ethnicity')
plt.ylabel('Crime Class')
plt.show()
# No relationship between class of crime and race & ethnicity
sns.scatterplot(x = 'test_age', y = 'class', data = sentence_min)
plt.xticks(rotation=45, ha='right')
plt.title('Age at Conviction')
plt.xlabel('Age')
plt.ylabel('Crime Class')
plt.show()
# No relationship between class of crime and age
sentence_min.describe()
| Offender Number | convicted_age | test_age | female | |
|---|---|---|---|---|
| count | 3.341200e+04 | 31755.000000 | 33412.000000 | 33412.000000 |
| mean | 1.557457e+07 | 33.947567 | 33.782144 | 0.144260 |
| std | 7.221105e+06 | 11.003366 | 11.016811 | 0.351358 |
| min | 7.190000e+02 | 15.000000 | 15.000000 | 0.000000 |
| 25% | 1.806482e+07 | 25.000000 | 25.000000 | 0.000000 |
| 50% | 1.905745e+07 | 32.000000 | 32.000000 | 0.000000 |
| 75% | 2.005057e+07 | 41.000000 | 41.000000 | 0.000000 |
| max | 2.970396e+07 | 91.000000 | 91.000000 | 1.000000 |
k_means = KMeans(n_clusters = 2, n_init = 50, random_state = 12345)
# changing from 2 to 3 had no significant change, changing from 25 to 50 which also did nothing
sentence_min_X = sentence_min.copy()
# Copying for encoding, will use this one for k clustering
sentence_min_X = pd.get_dummies(sentence_min_X, columns = ['Race & Ethnicity'], prefix = 'race')
sentence_min_X = pd.get_dummies(sentence_min_X, columns = ['class'], prefix = 'e')
# using e for encoded
sentence_min_X.head(1)
| Offender Number | Sex | offense_classification | convicted_age | test_age | female | specification | race_American Indian or Alaska Native | race_Asian or Pacific Islander | race_Black | race_Hispanic | race_Unknown | race_White | e_Aggravated Misdemeanor | e_Felony | e_NCIC Virtual Code | e_Serious Misdemeanor | e_Simple Misdemeanor | e_Special Sentence 2005 | e_Unknown | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 719 | Male | A Felony - Murder/Manslaughter | 47.0 | 47 | 0 | Murder/Manslaughter | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
sentence_min_X.columns
Index(['Offender Number', 'Sex', 'offense_classification', 'convicted_age',
'test_age', 'female', 'specification',
'race_American Indian or Alaska Native',
'race_Asian or Pacific Islander', 'race_Black', 'race_Hispanic',
'race_Unknown', 'race_White', 'e_Aggravated Misdemeanor', 'e_Felony',
'e_NCIC Virtual Code', 'e_Serious Misdemeanor', 'e_Simple Misdemeanor',
'e_Special Sentence 2005', 'e_Unknown'],
dtype='object')
sentence_min_X.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 33412 entries, 0 to 33411 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Offender Number 33412 non-null int64 1 Sex 33412 non-null object 2 offense_classification 33412 non-null object 3 convicted_age 31755 non-null float64 4 test_age 33412 non-null int64 5 female 33412 non-null int64 6 specification 33412 non-null object 7 race_American Indian or Alaska Native 33412 non-null uint8 8 race_Asian or Pacific Islander 33412 non-null uint8 9 race_Black 33412 non-null uint8 10 race_Hispanic 33412 non-null uint8 11 race_Unknown 33412 non-null uint8 12 race_White 33412 non-null uint8 13 e_Aggravated Misdemeanor 33412 non-null uint8 14 e_Felony 33412 non-null uint8 15 e_NCIC Virtual Code 33412 non-null uint8 16 e_Serious Misdemeanor 33412 non-null uint8 17 e_Simple Misdemeanor 33412 non-null uint8 18 e_Special Sentence 2005 33412 non-null uint8 19 e_Unknown 33412 non-null uint8 dtypes: float64(1), int64(3), object(3), uint8(13) memory usage: 2.2+ MB
# Data for training
k_means.fit(sentence_min_X[[
'test_age', 'female',
'race_American Indian or Alaska Native',
'race_Asian or Pacific Islander', 'race_Black', 'race_Hispanic',
'race_Unknown', 'race_White', 'e_Felony', 'e_Aggravated Misdemeanor',
'e_Felony',
'e_NCIC Virtual Code', 'e_Serious Misdemeanor',
'e_Simple Misdemeanor', 'e_Special Sentence 2005', 'e_Unknown']])
KMeans(n_clusters=2, n_init=50, random_state=12345)
sentence_min_X['label'] = k_means.labels_
sentence_min_X['label'].value_counts()
0 21584 1 11828 Name: label, dtype: int64
#(ggplot(sentence_min_X, aes(x = "test_age", y = "class")) + theme_bw() + geom_point(aes(color = "factor(label)")))
(ggplot(sentence_min_X, aes(x = "test_age", y = "e_Felony")) + theme_bw() + geom_point(aes(color = "factor(label)")))
<ggplot: (8767579385562)>
(ggplot(sentence_min_X, aes(x = "test_age", y = "e_Aggravated Misdemeanor")) + theme_bw() + geom_point(aes(color = "factor(label)")))
<ggplot: (-9223363269275426279)>
(ggplot(sentence_min_X, aes(x = "test_age", y = "specification")) + theme_bw() + geom_point(aes(color = "factor(label)")))
<ggplot: (8767579251755)>
wcss = []
for i in range(1, 11):
kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(sentence_min_X[[
'test_age', 'female',
'race_American Indian or Alaska Native',
'race_Asian or Pacific Islander', 'race_Black', 'race_Hispanic',
'race_Unknown', 'race_White', 'e_Felony', 'e_Aggravated Misdemeanor',
'e_Felony',
'e_NCIC Virtual Code', 'e_Serious Misdemeanor',
'e_Simple Misdemeanor', 'e_Special Sentence 2005', 'e_Unknown']])
wcss.append(kmeans.inertia_) # Inertia: Sum of squared distances of samples to their closest cluster center
# Plot the elbow graph
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters (k)')
plt.ylabel('WCSS')
plt.show()
(ggplot(sentence_min_X, aes(x = "test_age"))
+ geom_histogram(aes(fill = "factor(label)"), binwidth=5, alpha=0.7)
+ theme_bw()
+ facet_wrap('~ label', ncol=1)
+ labs(title="Distribution of Test Age by Label",
x="Test Age",
y="Frequency",
fill="Label")
)
<ggplot: (8767579154040)>
(ggplot(sentence_min_X, aes(x = "female"))
+ geom_histogram(aes(fill = "factor(label)"), binwidth=5, alpha=0.7)
+ theme_bw()
+ facet_wrap('~ label', ncol=1) # Removed factor() here
+ labs(title="Distribution of Sex by Label",
x="Sex",
y="Frequency",
fill="Label")
)
<ggplot: (8767579046618)>
(ggplot(sentence_min_X, aes(x = "race_White"))
+ geom_histogram(aes(fill = "factor(label)"), binwidth=5, alpha=0.7)
+ theme_bw()
+ facet_wrap('~ label', ncol=1) # Removed factor() here
+ labs(title="Distribution of 1 Race by Label",
x="Race: White",
y="Frequency",
fill="Label")
)
<ggplot: (-9223363269275654873)>
(ggplot(sentence_min_X, aes(x = "race_Black"))
+ geom_histogram(aes(fill = "factor(label)"), binwidth=5, alpha=0.7)
+ theme_bw()
+ facet_wrap('~ label', ncol=1) # Removed factor() here
+ labs(title="Distribution of 1 Race by Label",
x="Race: Black",
y="Frequency",
fill="Label")
)
<ggplot: (-9223363269275942652)>
(ggplot(sentence_min_X, aes(x = "race_American Indian or Alaska Native"))
+ geom_histogram(aes(fill = "factor(label)"), binwidth=5, alpha=0.7)
+ theme_bw()
+ facet_wrap('~ label', ncol=1) # Removed factor() here
+ labs(title="Distribution of 1 Race by Label",
x="Race: American Indian or Alaska Native",
y="Frequency",
fill="Label")
)
<ggplot: (8767578743588)>
sentence_min_X.columns
Index(['Offender Number', 'Sex', 'offense_classification', 'convicted_age',
'test_age', 'female', 'specification',
'race_American Indian or Alaska Native',
'race_Asian or Pacific Islander', 'race_Black', 'race_Hispanic',
'race_Unknown', 'race_White', 'e_Aggravated Misdemeanor', 'e_Felony',
'e_NCIC Virtual Code', 'e_Serious Misdemeanor', 'e_Simple Misdemeanor',
'e_Special Sentence 2005', 'e_Unknown', 'label'],
dtype='object')
sentence_min.drop('convicted_age', axis=1, inplace=True)
col_val_list = sentence_min['class'].unique()
col_val_list
array(['Felony', 'Aggravated Misdemeanor', 'NCIC Virtual Code',
'Special Sentence 2005', 'Serious Misdemeanor',
'Simple Misdemeanor', 'Unknown'], dtype=object)
sentence_min.describe()
| Offender Number | test_age | female | |
|---|---|---|---|
| count | 3.341200e+04 | 33412.000000 | 33412.000000 |
| mean | 1.557457e+07 | 33.782144 | 0.144260 |
| std | 7.221105e+06 | 11.016811 | 0.351358 |
| min | 7.190000e+02 | 15.000000 | 0.000000 |
| 25% | 1.806482e+07 | 25.000000 | 0.000000 |
| 50% | 1.905745e+07 | 32.000000 | 0.000000 |
| 75% | 2.005057e+07 | 41.000000 | 0.000000 |
| max | 2.970396e+07 | 91.000000 | 1.000000 |
sentence_min['test_age'].mode()
0 22 dtype: int64
sentence_min['test_age'].median()
# My data isn't symmetrical
32.0
bell_fig = ffdemo.create_distplot([sentence_min["test_age"].tolist()], ["Age"], show_hist=False)
#Source: Tutorials Point, n.d.
bell_fig.show()
sentence_min.hist()
plt.show()
sentence_min['test_age'].value_counts().plot(kind='bar', figsize=(20, 6))
<matplotlib.axes._subplots.AxesSubplot at 0x7f95c4fce198>
plt.figure(figsize=(8, 6))
sentence_min['class'].value_counts().plot(kind='bar')
plt.title('Class of Convictions')
plt.xlabel('Classifiction of Crime')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right') # Rotate x-axis labels
plt.show()
# Create age ranges
sentence_min['age_group'] = pd.cut(sentence_min['test_age'], bins=[10, 20, 30, 40, 50, 60, 70 , 80, 90], right=False, labels=['20s', '30s', '40s', '50s', '60s', '70s', '80s', '90s'])
# Create a contingency table
contingency_table = pd.crosstab(sentence_min['age_group'], sentence_min['class'])
# Plot the stacked bar chart
contingency_table.plot(kind='bar', stacked=True, figsize=(10, 6))
plt.title('Conviction Classification by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Count')
plt.legend(title='Conviction')
plt.show()
# The most common is felony, is that bc they convict of the most serious crime (or is it my data)
# Create a contingency table
contingency_table2 = pd.crosstab(sentence_min['female'], sentence_min['class'])
# Plot the stacked bar chart
contingency_table2.plot(kind='bar', stacked=True, figsize=(10, 6))
plt.title('Conviction Classification by Gender')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.legend(title='Conviction')
plt.show()
# The most common is felony, is that bc they convict of the most serious crime (or is it my data)
# Count the occurrences of each conviction class
conviction_counts = sentence_min['class'].value_counts()
# Create the bar chart
plt.figure(figsize=(10, 6)) # Adjust figure size as needed
conviction_counts.plot(kind='bar')
plt.title('Count of Conviction Classes')
plt.xlabel('Class of Conviction')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right') # Rotate x-axis labels for better readability if there are many classes
plt.tight_layout() # Adjust layout to prevent labels from overlapping
plt.show()
female_df = sentence_min[sentence_min['female'] == 1]
class_counts = female_df['class'].value_counts()
# Plot the bar chart
plt.figure(figsize=(8, 6))
class_counts.plot(kind='bar')
plt.title('Female Convictions')
plt.xlabel('Conviction Classification')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right') # Rotate x-axis labels for readability
plt.show()
male_df = sentence_min[sentence_min['female'] == 0]
class_counts2 = male_df['class'].value_counts()
# Plot the bar chart
plt.figure(figsize=(8, 6))
class_counts2.plot(kind='bar')
plt.title('Male Convictions')
plt.xlabel('Conviction Classification')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right') # Rotate x-axis labels for readability
plt.show()
# Count the occurrences of each conviction class
specification_counts = sentence_min['specification'].value_counts()
# Create the bar chart
plt.figure(figsize=(15, 6)) # Adjust figure size as needed
specification_counts.plot(kind='bar')
plt.title('Count of Conviction Specifications')
plt.xlabel('Type of Crime')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right') # Rotate x-axis labels for better readability if there are many classes
plt.tight_layout() # Adjust layout to prevent labels from overlapping
plt.show()
# Create a contingency table (cross-tabulation) of the two columns
contingency_table3 = pd.crosstab(sentence_min['class'], sentence_min['Race & Ethnicity'])
# Perform the Chi-Square Test of Independence
chi2, p, dof, expected = chi2_contingency(contingency_table3)
# Print the results
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Expected Frequencies Table:\n", expected)
# Interpret the results based on the p-value
alpha = 0.05 # Common significance level
if p < alpha:
print("\nReject the null hypothesis: There is a statistically significant association between convicted crime and race.")
else:
print("\nFail to reject the null hypothesis: There is no statistically significant association between convicted crime and race.")
# Source: Google, 2025
Chi-Square Statistic: 34.12396210847997 P-value: 0.2759040100105876 Degrees of Freedom: 30 Expected Frequencies Table: [[1.69298973e+02 7.59472030e+01 1.71988770e+03 3.51481847e+02 1.35620005e+00 5.23402807e+03] [5.71966897e+02 2.56583281e+02 5.81054222e+03 1.18746132e+03 4.58184430e+00 1.76828644e+04] [1.12088833e-01 5.02828410e-02 1.13869684e+00 2.32707791e-01 8.97907875e-04 3.46532579e+00] [5.62685942e+00 2.52419862e+00 5.71625812e+01 1.16819311e+01 4.50749753e-02 1.73959355e+02] [1.79342133e-01 8.04525456e-02 1.82191494e+00 3.72332465e-01 1.43665260e-03 5.54452127e+00] [1.79342133e+00 8.04525456e-01 1.82191494e+01 3.72332465e+00 1.43665260e-02 5.54452127e+01] [2.24177666e-02 1.00565682e-02 2.27739367e-01 4.65415582e-02 1.79581575e-04 6.93065158e-01]] Fail to reject the null hypothesis: There is no statistically significant association between convicted crime and race.
# Create a contingency table (cross-tabulation) of the two columns
contingency_table4 = pd.crosstab(sentence_min['specification'], sentence_min['Race & Ethnicity'])
# Perform the Chi-Square Test of Independence
chi2, p, dof, expected = chi2_contingency(contingency_table4)
# Print the results
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Expected Frequencies Table:\n", expected)
# Interpret the results based on the p-value
alpha = 0.05 # Common significance level
if p < alpha:
print("\nReject the null hypothesis: There is a statistically significant association between the focus of crime and race.")
else:
print("\nFail to reject the null hypothesis: There is no statistically significant association between focus of crime and race.")
Chi-Square Statistic: 661.8019679650164 P-value: 4.984640071876641e-67 Degrees of Freedom: 145 Expected Frequencies Table: [[6.12005028e+00 2.74544312e+00 6.21728473e+01 1.27058454e+01 4.90257700e-02 1.89206788e+02] [1.56924366e-01 7.03959774e-02 1.59417557e+00 3.25790907e-01 1.25707102e-03 4.85145611e+00] [6.09763252e+00 2.73538655e+00 6.19451079e+01 1.26593038e+01 4.88461884e-02 1.88513723e+02] [1.18567568e+02 5.31891892e+01 1.20451351e+03 2.46158301e+02 9.49806950e-01 3.66562162e+03] [9.73603604e+01 4.36756757e+01 9.89072072e+02 2.02129987e+02 7.79922780e-01 3.00998198e+03] [6.48321810e+01 2.90835952e+01 6.58622250e+02 1.34598186e+02 5.19349915e-01 2.00434444e+03] [2.24177666e-02 1.00565682e-02 2.27739367e-01 4.65415582e-02 1.79581575e-04 6.93065158e-01] [1.79342133e-01 8.04525456e-02 1.82191494e+00 3.72332465e-01 1.43665260e-03 5.54452127e+00] [1.43249529e+01 6.42614708e+00 1.45525456e+02 2.97400557e+01 1.14752626e-01 4.42868636e+02] [2.12968783e+00 9.55373979e-01 2.16352399e+01 4.42144803e+00 1.70602496e-02 6.58411900e+01] [4.26161743e+01 1.91175361e+01 4.32932537e+02 8.84755021e+01 3.41384574e-01 1.31751687e+03] [2.98156296e+00 1.33752357e+00 3.02893358e+01 6.19002724e+00 2.38843495e-02 9.21776660e+01] [2.24177666e-02 1.00565682e-02 2.27739367e-01 4.65415582e-02 1.79581575e-04 6.93065158e-01] [2.24177666e-02 1.00565682e-02 2.27739367e-01 4.65415582e-02 1.79581575e-04 6.93065158e-01] [6.29939242e+00 2.82589566e+00 6.39947622e+01 1.30781778e+01 5.04624226e-02 1.94751309e+02] [3.59356799e+01 1.61206788e+01 3.65066206e+02 7.46061177e+01 2.87869265e-01 1.11098345e+03] [1.92792793e+01 8.64864865e+00 1.95855856e+02 4.00257400e+01 1.54440154e-01 5.96036036e+02] [5.44751728e+00 2.44374607e+00 5.53406662e+01 1.13095986e+01 4.36383227e-02 1.68414833e+02] [2.24177666e-02 1.00565682e-02 2.27739367e-01 4.65415582e-02 1.79581575e-04 6.93065158e-01] [2.60046093e+01 1.16656191e+01 2.64177666e+02 5.39882075e+01 2.08314627e-01 8.03955583e+02] [1.67460717e+01 7.51225644e+00 1.70121307e+02 3.47665440e+01 1.34147436e-01 5.17719673e+02] [2.69013199e-01 1.20678818e-01 2.73287241e+00 5.58498698e-01 2.15497890e-03 8.31678190e+00] [2.24177666e-02 1.00565682e-02 2.27739367e-01 4.65415582e-02 1.79581575e-04 6.93065158e-01] [9.07919547e+00 4.07291012e+00 9.22344437e+01 1.88493311e+01 7.27305378e-02 2.80691389e+02] [1.39886864e+01 6.27529855e+00 1.42109365e+02 2.90419323e+01 1.12058903e-01 4.32472659e+02] [9.45357218e+01 4.24085481e+01 9.60376912e+02 1.96265751e+02 7.57295501e-01 2.92265577e+03] [1.66788184e+01 7.48208674e+00 1.69438089e+02 3.46269193e+01 1.33608692e-01 5.15640478e+02] [1.14263356e+02 5.12583281e+01 1.16078755e+03 2.37222322e+02 9.15327287e-01 3.53255311e+03] [1.24642782e+01 5.59145192e+00 1.26623088e+02 2.58771063e+01 9.98473557e-02 3.85344228e+02] [2.25298554e+01 1.01068510e+01 2.28878064e+02 4.67742660e+01 1.80479483e-01 6.96530484e+02]] Reject the null hypothesis: There is a statistically significant association between the focus of crime and race.
# Create a contingency table (cross-tabulation) of the two columns
contingency_table5 = pd.crosstab(sentence_min['class'], sentence_min['test_age'])
# Perform the Chi-Square Test of Independence
chi2, p, dof, expected = chi2_contingency(contingency_table5)
# Print the results
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Expected Frequencies Table:\n", expected)
# Interpret the results based on the p-value
alpha = 0.05 # Common significance level
if p < alpha:
print("\nReject the null hypothesis: There is a statistically significant association between convicted crime and age.")
else:
print("\nFail to reject the null hypothesis: There is no statistically significant association between convicted crime and age.")
# Source: Google, 2025
Chi-Square Statistic: 673.1274568303811 P-value: 5.045941226887007e-14 Degrees of Freedom: 420 Expected Frequencies Table: [[1.35615946e+00 1.26574883e+01 4.43012091e+01 1.54376152e+02 2.02745840e+02 2.78012690e+02 2.55636059e+02 2.90670178e+02 2.45916916e+02 2.77334610e+02 2.53827846e+02 2.90444152e+02 2.51341554e+02 2.85923620e+02 2.56314139e+02 2.80951036e+02 2.45238836e+02 2.80725009e+02 2.33259428e+02 2.73492159e+02 2.16759488e+02 2.47273076e+02 1.93704777e+02 2.17211541e+02 1.68389800e+02 1.90314378e+02 1.38328265e+02 1.64999401e+02 1.24540644e+02 1.41718664e+02 1.10979049e+02 1.27252963e+02 1.03294146e+02 1.08040704e+02 8.90544714e+01 9.89996408e+01 8.45339399e+01 8.11435412e+01 7.52668502e+01 7.95613552e+01 5.74107506e+01 5.56025380e+01 3.84245181e+01 4.45272357e+01 2.73492159e+01 3.09656411e+01 2.39588172e+01 1.87602059e+01 1.35615946e+01 1.40136478e+01 9.26708967e+00 9.49311625e+00 5.65066443e+00 7.00682390e+00 3.84245181e+00 3.16437208e+00 2.26026577e+00 2.26026577e+00 6.78079732e-01 1.80821262e+00 1.13013289e+00 1.58218604e+00 6.78079732e-01 6.78079732e-01 2.26026577e-01 2.26026577e-01 4.52053155e-01 4.52053155e-01 2.26026577e-01 2.26026577e-01 2.26026577e-01] [4.58188675e+00 4.27642763e+01 1.49674967e+02 5.21571441e+02 6.84992069e+02 9.39286783e+02 8.63685652e+02 9.82051059e+02 8.30848797e+02 9.36995840e+02 8.57576470e+02 9.81287412e+02 8.49176344e+02 9.66014456e+02 8.65976595e+02 9.49214204e+02 8.28557853e+02 9.48450557e+02 7.88084521e+02 9.24013827e+02 7.32338232e+02 8.35430684e+02 6.54446157e+02 7.33865527e+02 5.68917604e+02 6.42991440e+02 4.67352448e+02 5.57462888e+02 4.20769933e+02 4.78807165e+02 3.74951065e+02 4.29933706e+02 3.48987041e+02 3.65023644e+02 3.00877230e+02 3.34477733e+02 2.85604274e+02 2.74149557e+02 2.54294714e+02 2.68804023e+02 1.93966539e+02 1.87857357e+02 1.29820125e+02 1.50438615e+02 9.24013827e+01 1.04619747e+02 8.09466659e+01 6.33827667e+01 4.58188675e+01 4.73461631e+01 3.13095594e+01 3.20732072e+01 1.90911948e+01 2.36730815e+01 1.29820125e+01 1.06910691e+01 7.63647791e+00 7.63647791e+00 2.29094337e+00 6.10918233e+00 3.81823896e+00 5.34553454e+00 2.29094337e+00 2.29094337e+00 7.63647791e-01 7.63647791e-01 1.52729558e+00 1.52729558e+00 7.63647791e-01 7.63647791e-01 7.63647791e-01] [8.97881001e-04 8.38022267e-03 2.93307794e-02 1.02208787e-01 1.34233210e-01 1.84065605e-01 1.69250569e-01 1.92445828e-01 1.62815755e-01 1.83616665e-01 1.68053394e-01 1.92296181e-01 1.66407279e-01 1.89303244e-01 1.69699509e-01 1.86011014e-01 1.62366814e-01 1.85861367e-01 1.54435532e-01 1.81072669e-01 1.43511313e-01 1.63713636e-01 1.28247336e-01 1.43810607e-01 1.11486891e-01 1.26002634e-01 9.15838621e-02 1.09242188e-01 8.24554052e-02 9.38285646e-02 7.34765952e-02 8.42511672e-02 6.83886029e-02 7.15311864e-02 5.89608524e-02 6.55453131e-02 5.59679157e-02 5.37232132e-02 4.98323955e-02 5.26756854e-02 3.80102957e-02 3.68131210e-02 2.54399617e-02 2.94804262e-02 1.81072669e-02 2.05016162e-02 1.58625643e-02 1.24206872e-02 8.97881001e-03 9.27810368e-03 6.13552017e-03 6.28516701e-03 3.74117084e-03 4.63905184e-03 2.54399617e-03 2.09505567e-03 1.49646833e-03 1.49646833e-03 4.48940500e-04 1.19717467e-03 7.48234167e-04 1.04752783e-03 4.48940500e-04 4.48940500e-04 1.49646833e-04 1.49646833e-04 2.99293667e-04 2.99293667e-04 1.49646833e-04 1.49646833e-04 1.49646833e-04] [4.50736262e-02 4.20687178e-01 1.47240512e+00 5.13088112e+00 6.73850712e+00 9.24009338e+00 8.49637855e+00 9.66078056e+00 8.17335089e+00 9.21755657e+00 8.43628038e+00 9.65326829e+00 8.35364540e+00 9.50302287e+00 8.51891536e+00 9.33775290e+00 8.15081408e+00 9.33024063e+00 7.75266371e+00 9.08984796e+00 7.20426793e+00 8.21842452e+00 6.43801628e+00 7.21929247e+00 5.59664193e+00 6.32533222e+00 4.59750988e+00 5.48395786e+00 4.13926134e+00 4.71019394e+00 3.68852508e+00 4.22940860e+00 3.43310787e+00 3.59086556e+00 2.95983479e+00 3.29037472e+00 2.80958937e+00 2.69690530e+00 2.50158626e+00 2.64431941e+00 1.90811684e+00 1.84801868e+00 1.27708608e+00 1.47991739e+00 9.08984796e-01 1.02918113e+00 7.96300730e-01 6.23518496e-01 4.50736262e-01 4.65760805e-01 3.08003113e-01 3.15515384e-01 1.87806776e-01 2.32880402e-01 1.27708608e-01 1.05171795e-01 7.51227104e-02 7.51227104e-02 2.25368131e-02 6.00981683e-02 3.75613552e-02 5.25858973e-02 2.25368131e-02 2.25368131e-02 7.51227104e-03 7.51227104e-03 1.50245421e-02 1.50245421e-02 7.51227104e-03 7.51227104e-03 7.51227104e-03] [1.43660960e-03 1.34083563e-02 4.69292470e-02 1.63534060e-01 2.14773135e-01 2.94504968e-01 2.70800910e-01 3.07913325e-01 2.60505208e-01 2.93786663e-01 2.68885430e-01 3.07673890e-01 2.66251646e-01 3.02885191e-01 2.71519215e-01 2.97617622e-01 2.59786903e-01 2.97378187e-01 2.47096851e-01 2.89716270e-01 2.29618101e-01 2.61941817e-01 2.05195738e-01 2.30096971e-01 1.78379025e-01 2.01604214e-01 1.46534179e-01 1.74787501e-01 1.31928648e-01 1.50125703e-01 1.17562552e-01 1.34801868e-01 1.09421765e-01 1.14449898e-01 9.43373638e-02 1.04872501e-01 8.95486652e-02 8.59571411e-02 7.97318329e-02 8.42810966e-02 6.08164731e-02 5.89009937e-02 4.07039387e-02 4.71686819e-02 2.89716270e-02 3.28025859e-02 2.53801030e-02 1.98730995e-02 1.43660960e-02 1.48449659e-02 9.81683228e-03 1.00562672e-02 5.98587334e-03 7.42248294e-03 4.07039387e-03 3.35208907e-03 2.39434934e-03 2.39434934e-03 7.18304801e-04 1.91547947e-03 1.19717467e-03 1.67604453e-03 7.18304801e-04 7.18304801e-04 2.39434934e-04 2.39434934e-04 4.78869867e-04 4.78869867e-04 2.39434934e-04 2.39434934e-04 2.39434934e-04] [1.43660960e-02 1.34083563e-01 4.69292470e-01 1.63534060e+00 2.14773135e+00 2.94504968e+00 2.70800910e+00 3.07913325e+00 2.60505208e+00 2.93786663e+00 2.68885430e+00 3.07673890e+00 2.66251646e+00 3.02885191e+00 2.71519215e+00 2.97617622e+00 2.59786903e+00 2.97378187e+00 2.47096851e+00 2.89716270e+00 2.29618101e+00 2.61941817e+00 2.05195738e+00 2.30096971e+00 1.78379025e+00 2.01604214e+00 1.46534179e+00 1.74787501e+00 1.31928648e+00 1.50125703e+00 1.17562552e+00 1.34801868e+00 1.09421765e+00 1.14449898e+00 9.43373638e-01 1.04872501e+00 8.95486652e-01 8.59571411e-01 7.97318329e-01 8.42810966e-01 6.08164731e-01 5.89009937e-01 4.07039387e-01 4.71686819e-01 2.89716270e-01 3.28025859e-01 2.53801030e-01 1.98730995e-01 1.43660960e-01 1.48449659e-01 9.81683228e-02 1.00562672e-01 5.98587334e-02 7.42248294e-02 4.07039387e-02 3.35208907e-02 2.39434934e-02 2.39434934e-02 7.18304801e-03 1.91547947e-02 1.19717467e-02 1.67604453e-02 7.18304801e-03 7.18304801e-03 2.39434934e-03 2.39434934e-03 4.78869867e-03 4.78869867e-03 2.39434934e-03 2.39434934e-03 2.39434934e-03] [1.79576200e-04 1.67604453e-03 5.86615587e-03 2.04417575e-02 2.68466419e-02 3.68131210e-02 3.38501137e-02 3.84891656e-02 3.25631510e-02 3.67233329e-02 3.36106788e-02 3.84592362e-02 3.32814558e-02 3.78606489e-02 3.39399018e-02 3.72022028e-02 3.24733629e-02 3.71722734e-02 3.08871064e-02 3.62145337e-02 2.87022627e-02 3.27427272e-02 2.56494673e-02 2.87621214e-02 2.22973782e-02 2.52005268e-02 1.83167724e-02 2.18484377e-02 1.64910810e-02 1.87657129e-02 1.46953190e-02 1.68502334e-02 1.36777206e-02 1.43062373e-02 1.17921705e-02 1.31090626e-02 1.11935831e-02 1.07446426e-02 9.96647911e-03 1.05351371e-02 7.60205914e-03 7.36262421e-03 5.08799234e-03 5.89608524e-03 3.62145337e-03 4.10032324e-03 3.17251287e-03 2.48413744e-03 1.79576200e-03 1.85562074e-03 1.22710403e-03 1.25703340e-03 7.48234167e-04 9.27810368e-04 5.08799234e-04 4.19011134e-04 2.99293667e-04 2.99293667e-04 8.97881001e-05 2.39434934e-04 1.49646833e-04 2.09505567e-04 8.97881001e-05 8.97881001e-05 2.99293667e-05 2.99293667e-05 5.98587334e-05 5.98587334e-05 2.99293667e-05 2.99293667e-05 2.99293667e-05]] Reject the null hypothesis: There is a statistically significant association between convicted crime and age.
# Create a contingency table (cross-tabulation) of the two columns
contingency_table6 = pd.crosstab(sentence_min['specification'], sentence_min['test_age'])
# Perform the Chi-Square Test of Independence
chi2, p, dof, expected = chi2_contingency(contingency_table6)
# Print the results
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Expected Frequencies Table:\n", expected)
# Interpret the results based on the p-value
alpha = 0.05 # Common significance level
if p < alpha:
print("\nReject the null hypothesis: There is a statistically significant association between focus of crime and age.")
else:
print("\nFail to reject the null hypothesis: There is no statistically significant association between the focus of crime and age.")
# Source: Google, 2025
Chi-Square Statistic: 2564.5791708004153 P-value: 4.266869523525903e-15 Degrees of Freedom: 2030 Expected Frequencies Table: [[4.90243026e-02 4.57560158e-01 1.60146055e+00 ... 8.17071711e-03 8.17071711e-03 8.17071711e-03] [1.25703340e-03 1.17323117e-02 4.10630911e-02 ... 2.09505567e-04 2.09505567e-04 2.09505567e-04] [4.88447264e-02 4.55884113e-01 1.59559440e+00 ... 8.14078774e-03 8.14078774e-03 8.14078774e-03] ... [9.15479468e-01 8.54447504e+00 2.99056626e+01 ... 1.52579911e-01 1.52579911e-01 1.52579911e-01] [9.98443673e-02 9.31880761e-01 3.26158266e+00 ... 1.66407279e-02 1.66407279e-02 1.66407279e-02] [1.80474081e-01 1.68442476e+00 5.89548665e+00 ... 3.00790135e-02 3.00790135e-02 3.00790135e-02]] Reject the null hypothesis: There is a statistically significant association between focus of crime and age.
# Create a contingency table (cross-tabulation) of the two columns
contingency_table8 = pd.crosstab(sentence_min['class'], sentence_min['Sex'])
# Perform the Chi-Square Test of Independence
chi2, p, dof, expected = chi2_contingency(contingency_table8)
# Print the results
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Expected Frequencies Table:\n", expected)
# Interpret the results based on the p-value
alpha = 0.05 # Common significance level
if p < alpha:
print("\nReject the null hypothesis: There is a statistically significant association between convicted crime and sex.")
else:
print("\nFail to reject the null hypothesis: There is no statistically significant association between convicted crime and sex.")
# Source: Google, 2025
Chi-Square Statistic: 3.931890418461456 P-value: 0.6858931472788716 Degrees of Freedom: 6 Expected Frequencies Table: [[1.08944810e+03 6.46255190e+03] [3.68078235e+03 2.18342176e+04] [7.21297737e-01 4.27870226e+00] [3.62091464e+01 2.14790854e+02] [1.15407638e+00 6.84592362e+00] [1.15407638e+01 6.84592362e+01] [1.44259547e-01 8.55740453e-01]] Fail to reject the null hypothesis: There is no statistically significant association between convicted crime and sex.
# Create a contingency table (cross-tabulation) of the two columns
contingency_table9 = pd.crosstab(sentence_min['specification'], sentence_min['Sex'])
# Perform the Chi-Square Test of Independence
chi2, p, dof, expected = chi2_contingency(contingency_table9)
# Print the results
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Expected Frequencies Table:\n", expected)
# Interpret the results based on the p-value
alpha = 0.05 # Common significance level
if p < alpha:
print("\nReject the null hypothesis: There is a statistically significant association between focus of crime and sex.")
else:
print("\nFail to reject the null hypothesis: There is no statistically significant association between focus of crime and sex.")
# Source: Google, 2025
Chi-Square Statistic: 26.90887479217267 P-value: 0.5766216483502722 Degrees of Freedom: 29 Expected Frequencies Table: [[3.93828565e+01 2.33617144e+02] [1.00981683e+00 5.99018317e+00] [3.92385969e+01 2.32761403e+02] [7.62988747e+02 4.52601125e+03] [6.26519215e+02 3.71648079e+03] [4.17198611e+02 2.47480139e+03] [1.44259547e-01 8.55740453e-01] [1.15407638e+00 6.84592362e+00] [9.21818508e+01 5.46818149e+02] [1.37046570e+01 8.12953430e+01] [2.74237400e+02 1.62676260e+03] [1.91865198e+01 1.13813480e+02] [1.44259547e-01 8.55740453e-01] [1.44259547e-01 8.55740453e-01] [4.05369328e+01 2.40463067e+02] [2.31248055e+02 1.37175195e+03] [1.24063211e+02 7.35936789e+02] [3.50550700e+01 2.07944930e+02] [1.44259547e-01 8.55740453e-01] [1.67341075e+02 9.92658925e+02] [1.07761882e+02 6.39238118e+02] [1.73111457e+00 1.02688854e+01] [1.44259547e-01 8.55740453e-01] [5.84251167e+01 3.46574883e+02] [9.00179576e+01 5.33982042e+02] [6.08342512e+02 3.60865749e+03] [1.07329103e+02 6.36670897e+02] [7.35435173e+02 4.36256483e+03] [8.02083084e+01 4.75791692e+02] [1.44980845e+02 8.60019155e+02]] Fail to reject the null hypothesis: There is no statistically significant association between focus of crime and sex.